diff --git a/CHANGELOG.md b/CHANGELOG.md index 57427d39b..040499ed8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ - improve(io): when available, default to using `soundfile` backend - improve(pipeline): do not extract embeddings when `max_speakers` is set to 1 +- improve(pipeline): optimize memory usage of most pipelines ([#1713](https://github.com/pyannote/pyannote-audio/pull/1713) by [@benniekiss](https://github.com/benniekiss/)) ## Version 3.2.0 (2024-05-08) diff --git a/pyannote/audio/core/inference.py b/pyannote/audio/core/inference.py index 0c3e9b212..e43e94f98 100644 --- a/pyannote/audio/core/inference.py +++ b/pyannote/audio/core/inference.py @@ -559,9 +559,6 @@ def aggregate( step=frames.step, ) - masks = 1 - np.isnan(scores) - scores.data = np.nan_to_num(scores.data, copy=True, nan=0.0) - # Hamming window used for overlap-add aggregation hamming_window = ( np.hamming(num_frames_per_chunk).reshape(-1, 1) @@ -613,11 +610,13 @@ def aggregate( ) # loop on the scores of sliding chunks - for (chunk, score), (_, mask) in zip(scores, masks): + for chunk, score in scores: # chunk ~ Segment # score ~ (num_frames_per_chunk, num_classes)-shaped np.ndarray # mask ~ (num_frames_per_chunk, num_classes)-shaped np.ndarray - + mask = 1 - np.isnan(score) + np.nan_to_num(score, copy=False, nan=0.0) + start_frame = frames.closest_frame(chunk.start + 0.5 * frames.duration) aggregated_output[start_frame : start_frame + num_frames_per_chunk] += (