diff --git a/CHANGELOG.md b/CHANGELOG.md
index 57427d39b..040499ed8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@
 
 - improve(io): when available, default to using `soundfile` backend
 - improve(pipeline): do not extract embeddings when `max_speakers` is set to 1
+- improve(pipeline): optimize memory usage of most pipelines ([#1713](https://github.com/pyannote/pyannote-audio/pull/1713) by [@benniekiss](https://github.com/benniekiss/))
 
 ## Version 3.2.0 (2024-05-08)
 
diff --git a/pyannote/audio/core/inference.py b/pyannote/audio/core/inference.py
index 0c3e9b212..e43e94f98 100644
--- a/pyannote/audio/core/inference.py
+++ b/pyannote/audio/core/inference.py
@@ -559,9 +559,6 @@ def aggregate(
             step=frames.step,
         )
 
-        masks = 1 - np.isnan(scores)
-        scores.data = np.nan_to_num(scores.data, copy=True, nan=0.0)
-
         # Hamming window used for overlap-add aggregation
         hamming_window = (
             np.hamming(num_frames_per_chunk).reshape(-1, 1)
@@ -613,11 +610,13 @@ def aggregate(
         )
 
         # loop on the scores of sliding chunks
-        for (chunk, score), (_, mask) in zip(scores, masks):
+        for chunk, score in scores:
             # chunk ~ Segment
             # score ~ (num_frames_per_chunk, num_classes)-shaped np.ndarray
             # mask ~ (num_frames_per_chunk, num_classes)-shaped np.ndarray
-
+            mask = 1 - np.isnan(score)
+            np.nan_to_num(score, copy=False, nan=0.0)
+
             start_frame = frames.closest_frame(chunk.start + 0.5 * frames.duration)
 
             aggregated_output[start_frame : start_frame + num_frames_per_chunk] += (