Upload 11 files
- app.py +4 -6
- argshield.py +2 -3
- config.py +33 -30
- engine.py +239 -194
- models.py +323 -359
app.py
CHANGED

@@ -88,7 +88,7 @@ def process_audio_files(zip_file, model_name, layer, alpha):
     model_defaults = {
         "wavlm": 24, "wav2vec2": 24, "hubert": 24,
         "wavlm_base": 12, "wav2vec2_base": 12, "hubert_base": 12,
-        "wav2vec2_xlsr": 24, "ast": 12
+        "wav2vec2_xlsr": 24
     }
     layer_final = layer if layer is not None else model_defaults.get(model_name, 12)
 
@@ -184,7 +184,6 @@ def create_interface():
    | `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster, good quality |
    | `hubert_base` | HuBERT Base | 12 | |
    | `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |
-   | `ast` | Audio Spectrogram Transformer | 12 | Music |
 
    ## Parameters
 
@@ -233,7 +232,7 @@ def create_interface():
                 model_dropdown = gr.Dropdown(
                     choices=["raw", "wavlm", "wav2vec2", "hubert",
                              "wavlm_base", "wav2vec2_base", "hubert_base",
-                             "wav2vec2_xlsr", "ast"],
+                             "wav2vec2_xlsr"],
                     value="wav2vec2_base",
                     label="Select embedding model"
                 )
@@ -265,8 +264,7 @@ def create_interface():
            "wav2vec2_xlsr": {"maximum": 24, "value": 24, "interactive": True},
            "wavlm_base": {"maximum": 12, "value": 12, "interactive": True},
            "wav2vec2_base": {"maximum": 12, "value": 12, "interactive": True},
-           "hubert_base": {"maximum": 12, "value": 12, "interactive": True},
-           "ast": {"maximum": 12, "value": 12, "interactive": True}
+           "hubert_base": {"maximum": 12, "value": 12, "interactive": True}
        }
 
        config = model_configs.get(model_name, {"maximum": 12, "value": 12, "interactive": True})
@@ -308,4 +306,4 @@ def create_interface():
 
 if __name__ == "__main__":
     demo = create_interface()
-    demo.launch()
+    demo.launch()
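The `ast` entry also disappears from the slider-reconfiguration map. The pattern app.py relies on, a dropdown change handler returning a component update, can be sketched in isolation. This is hypothetical wiring (MODEL_CONFIGS, update_layer_slider, and the Blocks layout are illustrative), not the app's exact code; it assumes Gradio's Blocks API:

import gradio as gr

MODEL_CONFIGS = {
    "wav2vec2_xlsr": {"maximum": 24, "value": 24, "interactive": True},
    "wavlm_base": {"maximum": 12, "value": 12, "interactive": True},
    "wav2vec2_base": {"maximum": 12, "value": 12, "interactive": True},
    "hubert_base": {"maximum": 12, "value": 12, "interactive": True},
}

def update_layer_slider(model_name):
    # Unknown models fall back to a 12-layer config, mirroring
    # model_configs.get(...) in app.py.
    cfg = MODEL_CONFIGS.get(model_name, {"maximum": 12, "value": 12, "interactive": True})
    return gr.update(**cfg)

with gr.Blocks() as demo:
    model = gr.Dropdown(choices=list(MODEL_CONFIGS), value="wav2vec2_base",
                        label="Select embedding model")
    layer = gr.Slider(minimum=1, maximum=12, value=12, step=1, label="Layer")
    # Each dropdown change re-bounds the slider to the chosen model's depth.
    model.change(update_layer_slider, inputs=model, outputs=layer)

# demo.launch()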
argshield.py
CHANGED

@@ -17,7 +17,6 @@ MODEL_DEFAULT_LAYER = {
     "wav2vec2_base": 12,
     "hubert_base": 12,
     "wav2vec2_xlsr": 24,
-    "ast": 12,
 }
 
 def _read_manifest_json(path: Path):
@@ -88,7 +87,7 @@ def _parse_args():
         required=True,
         help=("Embedding model. Choices: "
               "raw, wavlm, wav2vec2, hubert, wavlm_base, wav2vec2_base, "
-              "hubert_base, wav2vec2_xlsr, ast"),
+              "hubert_base, wav2vec2_xlsr"),
     )
     parser.add_argument(
         "--layer",
@@ -141,4 +140,4 @@ def _validate_gpus(max_gpus_opt):
         raise SystemExit("--max-gpus must be an integer >= 0.")
     if mg < 0:
         raise SystemExit("--max-gpus must be >= 0.")
-    return mg
+    return mg
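The `--max-gpus` contract enforced by `_validate_gpus` is easy to exercise in isolation. A minimal sketch follows; the parser wiring is illustrative, and the int-coercion step is inferred from the error message rather than visible in the hunk:

import argparse

def _validate_gpus(max_gpus_opt):
    # Coerce to int (inferred step) and reject negatives, as in argshield.py.
    try:
        mg = int(max_gpus_opt)
    except (TypeError, ValueError):
        raise SystemExit("--max-gpus must be an integer >= 0.")
    if mg < 0:
        raise SystemExit("--max-gpus must be >= 0.")
    return mg

parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True,
                    help=("Embedding model. Choices: "
                          "raw, wavlm, wav2vec2, hubert, wavlm_base, wav2vec2_base, "
                          "hubert_base, wav2vec2_xlsr"))
parser.add_argument("--max-gpus", default=0)
args = parser.parse_args(["--model", "wav2vec2_base", "--max-gpus", "2"])
print(_validate_gpus(args.max_gpus))  # -> 2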
config.py
CHANGED

@@ -1,30 +1,33 @@
 import os
 import torch
 
 import warnings
 warnings.filterwarnings(
     "ignore",
     category=UserWarning,
     message=r"^expandable_segments not supported on this platform"
 )
 
 SR = 16_000
 RESULTS_ROOT = "results"
 BATCH_SIZE = 2
 ENERGY_WIN_MS = 20
 ENERGY_HOP_MS = 20
 SILENCE_RATIO = 0.1
 EPS = 1e-4
 COV_TOL = 1e-6
 
 DEFAULT_LAYER = 2
 DEFAULT_ADD_CI = True
 DEFAULT_DELTA_CI = 0.05
 DEFAULT_ALPHA = 1.0
 
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.6"
 os.environ["CUDA_LAUNCH_BLOCKING"] = "0"
 
 torch.backends.cudnn.benchmark = True
 torch.backends.cudnn.deterministic = False
 torch.backends.cudnn.enabled = True
+
+if torch.cuda.is_available():
+    torch.cuda.set_per_process_memory_fraction(0.8)
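The new tail of config.py soft-caps this process's CUDA allocations. What the guard buys, in standalone form (device 0 assumed; the printout is illustrative):

import torch

if torch.cuda.is_available():
    # Cap this process at ~80% of the GPU so co-resident processes keep
    # headroom; allocations beyond the cap raise OOM instead of squeezing others.
    torch.cuda.set_per_process_memory_fraction(0.8, device=0)
    total = torch.cuda.get_device_properties(0).total_memory
    print(f"soft cap ~{0.8 * total / 2**30:.1f} GiB of {total / 2**30:.1f} GiB")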
engine.py
CHANGED

@@ -4,6 +4,7 @@ from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
 import librosa
 import pandas as pd
+import numpy as np
 from audio import (
     assign_outputs_to_refs_by_corr,
     loudness_normalize,
@@ -73,7 +74,7 @@ def compute_mapss_measures(
 
     if algos is None:
        algos_to_run = sorted(
-            {algo for …
+            {algo for algo in canon_mix[0].systems.keys()} if canon_mix and canon_mix[0].systems else []
        )
    else:
        algos_to_run = list(algos)
@@ -132,6 +133,7 @@ def compute_mapss_measures(
    win = int(ENERGY_WIN_MS * SR / 1000)
    hop = int(ENERGY_HOP_MS * SR / 1000)
    voiced_mask_mix = []
+    total_frames_per_mix = []  # Store total frames for each mixture
 
    for i, mix in enumerate(mixture_entries):
        if verbose:
@@ -142,6 +144,7 @@ def compute_mapss_measures(
            refs_for_mix = [all_refs[e["id"]].cuda() for e in mix]
            mask = make_union_voiced_mask(refs_for_mix, win, hop)
            voiced_mask_mix.append(mask.cpu())
+            total_frames_per_mix.append(mask.shape[0])
            # Explicitly delete GPU tensors
            for ref in refs_for_mix:
                del ref
@@ -150,21 +153,33 @@ def compute_mapss_measures(
            refs_for_mix = [all_refs[e["id"]].cpu() for e in mix]
            mask = make_union_voiced_mask(refs_for_mix, win, hop)
            voiced_mask_mix.append(mask.cpu())
+            total_frames_per_mix.append(mask.shape[0])
 
    ordered_speakers = [e["id"] for e in flat_entries]
 
-    for …
-        …
-        os.makedirs(algo_dir, exist_ok=True)
-        …
-        for e in mix:
+    # Initialize storage for all mixtures and algorithms
+    all_mixture_results = {}  # mixture_id -> {algo -> {model -> data}}
+
+    for mix_idx, (mix_canon, mix_entries) in enumerate(zip(canon_mix, mixture_entries)):
+        mixture_id = mix_canon.mixture_id
+        all_mixture_results[mixture_id] = {}
+
+        # Get total frames for this mixture
+        total_frames = total_frames_per_mix[mix_idx]
+
+        # Get speakers for this mixture
+        mixture_speakers = [e["id"] for e in mix_entries]
+
+        for algo_idx, algo in enumerate(algos_to_run):
+            if verbose:
+                print(f"\nProcessing Mixture {mixture_id}, Algorithm {algo_idx + 1}/{len(algos_to_run)}: {algo}")
+
+            # Remove the old algo_dir creation here - we don't need these empty folders anymore
+
+            all_outs = {}
+            missing = []
+
+            for e in mix_entries:
                assigned_path = e.get("outs", {}).get(algo)
                if assigned_path is None:
                    missing.append((e["mixture"], e["id"]))
@@ -173,49 +188,49 @@ def compute_mapss_measures(
                wav, _ = librosa.load(str(assigned_path), sr=SR)
                all_outs[e["id"]] = torch.from_numpy(loudness_normalize(wav))
 
-            …
-            if not all_outs:
-                if verbose:
-                    warnings.warn(f"[{algo}] No outputs provided. Skipping algorithm.")
-                continue
-
-            ps_ts = {m: {s: [] for s in ordered_speakers} for m in models}
-            pm_ts = {m: {s: [] for s in ordered_speakers} for m in models}
-            ps_bias_ts = {m: {s: [] for s in ordered_speakers} for m in models}
-            ps_prob_ts = {m: {s: [] for s in ordered_speakers} for m in models}
-            pm_bias_ts = {m: {s: [] for s in ordered_speakers} for m in models}
-            pm_prob_ts = {m: {s: [] for s in ordered_speakers} for m in models}
-
-            for model_idx, mname in enumerate(models):
-                if verbose:
-                    print(f" Processing Model {model_idx + 1}/{len(models)}: {mname}")
-
-                …
-                speakers_this_mix = [e for e in …
+            if missing:
+                msg = f"[{algo}] missing outputs for {len(missing)} speaker(s) in mixture {mixture_id}"
+                if on_missing == "error":
+                    raise FileNotFoundError(msg)
+                else:
+                    if verbose:
+                        warnings.warn(msg + " Skipping those speakers.")
+
+            if not all_outs:
+                if verbose:
+                    warnings.warn(f"[{algo}] No outputs for mixture {mixture_id}. Skipping.")
+                continue
+
+            # Initialize storage for this algorithm
+            if algo not in all_mixture_results[mixture_id]:
+                all_mixture_results[mixture_id][algo] = {}
+
+            # Initialize frame-wise storage with NaN for all frames
+            ps_frames = {m: {s: [np.nan] * total_frames for s in mixture_speakers} for m in models}
+            pm_frames = {m: {s: [np.nan] * total_frames for s in mixture_speakers} for m in models}
+            ps_bias_frames = {m: {s: [np.nan] * total_frames for s in mixture_speakers} for m in models}
+            ps_prob_frames = {m: {s: [np.nan] * total_frames for s in mixture_speakers} for m in models}
+            pm_bias_frames = {m: {s: [np.nan] * total_frames for s in mixture_speakers} for m in models}
+            pm_prob_frames = {m: {s: [np.nan] * total_frames for s in mixture_speakers} for m in models}
+
+            for model_idx, mname in enumerate(models):
+                if verbose:
+                    print(f" Processing Model {model_idx + 1}/{len(models)}: {mname}")
+
+                for metric_type in ["PS", "PM"]:
+                    clear_gpu_memory()
+                    gc.collect()
+
+                    model_wrapper, layer_eff = load_model(mname, layer, max_gpus)
+                    get_gpu_memory_info(verbose)
+
+                    # Process only this mixture
+                    speakers_this_mix = [e for e in mix_entries if e["id"] in all_outs]
                    if not speakers_this_mix:
                        continue
 
                    if verbose:
-                        print(
-                            f"Processing mixture {k + 1}/{len(mixture_entries)} for {metric_type}"
-                        )
+                        print(f" Processing {metric_type} for mixture {mixture_id}")
 
                    all_signals_mix = []
                    all_masks_mix = []
@@ -240,7 +255,7 @@ def compute_mapss_measures(
                        sigs = [all_refs[s].numpy(), all_outs[s].numpy()] + dists
                        lbls = ["ref", "out"] + [f"d{i}" for i in range(len(dists))]
 
-                        masks = [voiced_mask_mix[k]] * len(sigs)
+                        masks = [voiced_mask_mix[mix_idx]] * len(sigs)
                        all_signals_mix.extend(sigs)
                        all_masks_mix.extend(masks)
                        all_labels_mix.extend([f"{s}-{l}" for l in lbls])
@@ -269,12 +284,124 @@ def compute_mapss_measures(
 
                    if embeddings_list:
                        embeddings = torch.cat(embeddings_list, dim=0)
-                        …
+                        E, L, D = embeddings.shape
+
+                        if L == 0:
+                            if verbose:
+                                print(
+                                    f" WARNING: mixture {mixture_id} produced 0 frames after masking; skipping.")
+                            continue
+
+                        # Get valid frame indices for this mixture
+                        mask = voiced_mask_mix[mix_idx]
+                        valid_frame_indices = torch.where(mask)[0].tolist()
+
+                        if verbose:
+                            print(f" Computing {metric_type} scores for {mname}...")
+
+                        # Process frames with their stored embeddings and labels
+                        with ThreadPoolExecutor(
+                            max_workers=min(2, ngpu if ngpu > 0 else 1)
+                        ) as executor:
+
+                            def process_frame(f, frame_idx, embeddings_mix, labels_mix):
+                                try:
+                                    frame_emb = embeddings_mix[:, f, :].detach().cpu().numpy()
+
+                                    if add_ci:
+                                        coords_d, coords_c, eigvals, k_sub_gauss = (
+                                            gpu_distributor.execute_on_gpu(
+                                                diffusion_map_torch,
+                                                frame_emb,
+                                                labels_mix,
+                                                alpha=alpha,
+                                                eig_solver="full",
+                                                return_eigs=True,
+                                                return_complement=True,
+                                                return_cval=add_ci,
+                                            )
+                                        )
+                                    else:
+                                        coords_d = gpu_distributor.execute_on_gpu(
+                                            diffusion_map_torch,
+                                            frame_emb,
+                                            labels_mix,
+                                            alpha=alpha,
+                                            eig_solver="full",
+                                            return_eigs=False,
+                                            return_complement=False,
+                                            return_cval=False,
+                                        )
+                                        coords_c = None
+                                        eigvals = None
+                                        k_sub_gauss = 1
+
+                                    if metric_type == "PS":
+                                        score = compute_ps(
+                                            coords_d, labels_mix, max_gpus
+                                        )
+                                        bias = prob = None
+                                        if add_ci:
+                                            bias, prob = ps_ci_components_full(
+                                                coords_d,
+                                                coords_c,
+                                                eigvals,
+                                                labels_mix,
+                                                delta=DEFAULT_DELTA_CI,
+                                            )
+                                        return frame_idx, "PS", score, bias, prob
+                                    else:
+                                        score = compute_pm(
+                                            coords_d, labels_mix, "gamma", max_gpus
+                                        )
+                                        bias = prob = None
+                                        if add_ci:
+                                            bias, prob = pm_ci_components_full(
+                                                coords_d,
+                                                coords_c,
+                                                eigvals,
+                                                labels_mix,
+                                                delta=DEFAULT_DELTA_CI,
+                                                K=k_sub_gauss,
+                                            )
+                                        return frame_idx, "PM", score, bias, prob
+
+                                except Exception as ex:
+                                    if verbose:
+                                        print(f" ERROR frame {frame_idx}: {ex}")
+                                    return None
+
+                            futures = [
+                                executor.submit(process_frame, f, valid_frame_indices[f], embeddings,
+                                                all_labels_mix)
+                                for f in range(L)
+                            ]
+
+                            for fut in futures:
+                                result = fut.result()
+                                if result is None:
+                                    continue
+
+                                frame_idx, metric, score, bias, prob = result
+
+                                if metric == "PS":
+                                    for sp in score:
+                                        if sp in mixture_speakers:
+                                            ps_frames[mname][sp][frame_idx] = score[sp]
+                                            if add_ci and bias is not None:
+                                                ps_bias_frames[mname][sp][frame_idx] = bias[sp]
+                                                ps_prob_frames[mname][sp][frame_idx] = prob[sp]
+                                else:
+                                    for sp in score:
+                                        if sp in mixture_speakers:
+                                            pm_frames[mname][sp][frame_idx] = score[sp]
+                                            if add_ci and bias is not None:
+                                                pm_bias_frames[mname][sp][frame_idx] = bias[sp]
+                                                pm_prob_frames[mname][sp][frame_idx] = prob[sp]
 
                except Exception as ex:
                    if verbose:
-                        print(f" ERROR processing mixture {k + 1}: {ex}")
+                        print(f" ERROR processing mixture {mixture_id}: {ex}")
                    continue
                finally:
                    # Always clean up after processing a mixture
@@ -284,155 +411,73 @@ def compute_mapss_measures(
                    clear_gpu_memory()
                    gc.collect()
 
-
-            print(f" Computing {metric_type} scores for {mname}...")
-
-            # Process mixtures with their stored embeddings and labels
-            with ThreadPoolExecutor(
-                max_workers=min(2, ngpu if ngpu > 0 else 1)
-            ) as executor:
-                for k in range(len(mixture_entries)):
-                    if k not in embs_by_mix:
-                        continue
-
-                    E, L, D = embs_by_mix[k].shape
-                    if L == 0:
-                        if verbose:
-                            print(f" WARNING: mixture {k + 1} produced 0 frames after masking; skipping.")
-                        continue
-
-                    # Get the labels for this mixture
-                    labels_for_mix = labels_by_mix[k]
-
-                    def process_frame(f, embeddings_mix, labels_mix):
-                        try:
-                            frame_emb = embeddings_mix[:, f, :].detach().cpu().numpy()
-
-                            if add_ci:
-                                coords_d, coords_c, eigvals, k_sub_gauss = (
-                                    gpu_distributor.execute_on_gpu(
-                                        diffusion_map_torch,
-                                        frame_emb,
-                                        labels_mix,
-                                        alpha=alpha,
-                                        eig_solver="full",
-                                        return_eigs=True,
-                                        return_complement=True,
-                                        return_cval=add_ci,
-                                    )
-                                )
-                            else:
-                                coords_d = gpu_distributor.execute_on_gpu(
-                                    diffusion_map_torch,
-                                    frame_emb,
-                                    labels_mix,
-                                    alpha=alpha,
-                                    eig_solver="full",
-                                    return_eigs=False,
-                                    return_complement=False,
-                                    return_cval=False,
-                                )
-                                coords_c = None
-                                eigvals = None
-                                k_sub_gauss = 1
-
-                            if metric_type == "PS":
-                                score = compute_ps(
-                                    coords_d, labels_mix, max_gpus
-                                )
-                                bias = prob = None
-                                if add_ci:
-                                    bias, prob = ps_ci_components_full(
-                                        coords_d,
-                                        coords_c,
-                                        eigvals,
-                                        labels_mix,
-                                        delta=DEFAULT_DELTA_CI,
-                                    )
-                                return f, "PS", score, bias, prob
-                            else:
-                                score = compute_pm(
-                                    coords_d, labels_mix, "gamma", max_gpus
-                                )
-                                bias = prob = None
-                                if add_ci:
-                                    bias, prob = pm_ci_components_full(
-                                        coords_d,
-                                        coords_c,
-                                        eigvals,
-                                        labels_mix,
-                                        delta=DEFAULT_DELTA_CI,
-                                        K=k_sub_gauss,
-                                    )
-                                return f, "PM", score, bias, prob
-
-                        except Exception as ex:
-                            if verbose:
-                                print(f" ERROR frame {f + 1}: {ex}")
-                            return None
-
-                    futures = [
-                        executor.submit(process_frame, f, embs_by_mix[k], labels_for_mix)
-                        for f in range(L)
-                    ]
-                    for fut in futures:
-                        result = fut.result()
-                        if result is None:
-                            continue
-
-                        f, metric, score, bias, prob = result
-
-                        if metric == "PS":
-                            for sp in score:
-                                ps_ts[mname][sp].append(score[sp])
-                                if add_ci and bias is not None:
-                                    ps_bias_ts[mname][sp].append(bias[sp])
-                                    ps_prob_ts[mname][sp].append(prob[sp])
-                        else:
-                            for sp in score:
-                                pm_ts[mname][sp].append(score[sp])
-                                if add_ci and bias is not None:
-                                    pm_bias_ts[mname][sp].append(bias[sp])
-                                    pm_prob_ts[mname][sp].append(prob[sp])
-
-            # Clean up after processing all mixtures for this metric
-            del embs_by_mix, labels_by_mix
+                    del model_wrapper
                    clear_gpu_memory()
                    gc.collect()
 
-            …
+                # Store results for this mixture and algorithm
+                all_mixture_results[mixture_id][algo][mname] = {
+                    'ps_frames': ps_frames[mname],
+                    'pm_frames': pm_frames[mname],
+                    'ps_bias_frames': ps_bias_frames[mname] if add_ci else None,
+                    'ps_prob_frames': ps_prob_frames[mname] if add_ci else None,
+                    'pm_bias_frames': pm_bias_frames[mname] if add_ci else None,
+                    'pm_prob_frames': pm_prob_frames[mname] if add_ci else None,
+                    'total_frames': total_frames
+                }
+
+        # Save results for this mixture after processing all algorithms
        if verbose:
-            print(f" Saving results for {…
-            …
-            for …
-            …
+            print(f" Saving results for mixture {mixture_id}...")
+
+        # Create timestamps in milliseconds - using lowercase hop
+        timestamps_ms = [i * hop * 1000 / SR for i in range(total_frames)]
+
+        for model in models:
+            # Prepare PS data
+            ps_data = {'timestamp_ms': timestamps_ms}
+            pm_data = {'timestamp_ms': timestamps_ms}
+            ci_data = {'timestamp_ms': timestamps_ms} if add_ci else None
+
+            # Combine data from all algorithms for this mixture
+            for algo in algos_to_run:
+                if algo not in all_mixture_results[mixture_id]:
+                    continue
+                if model not in all_mixture_results[mixture_id][algo]:
+                    continue
+
+                model_data = all_mixture_results[mixture_id][algo][model]
+
+                # Add PS data
+                for speaker in mixture_speakers:
+                    col_name = f"{algo}_{speaker}"
+                    ps_data[col_name] = model_data['ps_frames'][speaker]
+                    pm_data[col_name] = model_data['pm_frames'][speaker]
+
+                    if add_ci and ci_data is not None:
+                        ci_data[f"{algo}_{speaker}_ps_bias"] = model_data['ps_bias_frames'][speaker]
+                        ci_data[f"{algo}_{speaker}_ps_prob"] = model_data['ps_prob_frames'][speaker]
+                        ci_data[f"{algo}_{speaker}_pm_bias"] = model_data['pm_bias_frames'][speaker]
+                        ci_data[f"{algo}_{speaker}_pm_prob"] = model_data['pm_prob_frames'][speaker]
+
+            # Save CSV files for this mixture
+            mixture_dir = os.path.join(exp_root, mixture_id)
+            os.makedirs(mixture_dir, exist_ok=True)
+
+            pd.DataFrame(ps_data).to_csv(
+                os.path.join(mixture_dir, f"ps_scores_{model}.csv"),
+                index=False
+            )
+
+            pd.DataFrame(pm_data).to_csv(
+                os.path.join(mixture_dir, f"pm_scores_{model}.csv"),
+                index=False
+            )
+
+            if add_ci and ci_data is not None:
+                pd.DataFrame(ci_data).to_csv(
+                    os.path.join(mixture_dir, f"ci_{model}.csv"),
+                    index=False
                )
 
        del all_outs
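The core behavioral change above: scores are still computed only on union-voiced frames, but the per-mixture CSVs now carry one row per hop over the full timeline, so each metric column is pre-filled with NaN and written back through absolute frame indices (valid_frame_indices). A runnable sketch of that alignment trick, with illustrative names and a constant standing in for compute_ps:

import numpy as np
import pandas as pd
import torch

SR, HOP = 16_000, 320                        # 20 ms hop at 16 kHz, as in config.py
voiced = torch.tensor([0, 1, 1, 0, 1], dtype=torch.bool)
total_frames = voiced.shape[0]

scores = [np.nan] * total_frames             # one slot per frame; NaN = silent/unscored
valid_frame_indices = torch.where(voiced)[0].tolist()
for f, frame_idx in enumerate(valid_frame_indices):   # f indexes the packed embeddings
    scores[frame_idx] = 0.9 + 0.01 * f                 # stand-in for a per-frame score

df = pd.DataFrame({
    "timestamp_ms": [i * HOP * 1000 / SR for i in range(total_frames)],
    "algoA_spk1": scores,                              # column naming mirrors f"{algo}_{speaker}"
})
print(df)   # NaN rows line up exactly with frames outside the voiced mask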
models.py
CHANGED

@@ -1,360 +1,324 @@
 import queue
 import threading
 import gc
 
 import torch
 import torch.nn.functional as F
 from transformers import (
     HubertModel,
     Wav2Vec2FeatureExtractor,
     Wav2Vec2Model,
     WavLMModel,
 )
-…
+
+from config import BATCH_SIZE, ENERGY_HOP_MS, ENERGY_WIN_MS, SR
+from utils import get_gpu_count
+
+
+class BalancedDualGPUModel:
+
+    def __init__(self, model_name, layer, max_gpus=None):
+        self.layer = layer
+        self.models = []
+        self.extractors = []
+        self.devices = []
+        ngpu = get_gpu_count(max_gpus)
+
+        for gpu_id in range(min(ngpu, 2)):
+            device = f"cuda:{gpu_id}"
+            self.devices.append(device)
+            ckpt, cls, _ = get_model_config(layer)[model_name]
+            extractor = Wav2Vec2FeatureExtractor.from_pretrained(ckpt)
+
+            attn_impl = "eager" if cls is WavLMModel else "sdpa"
+            model = cls.from_pretrained(
+                ckpt,
+                output_hidden_states=True,
+                use_safetensors=True,
+                torch_dtype=torch.float16,
+                low_cpu_mem_usage=True,
+                attn_implementation=attn_impl
+            )
+            model.eval()
+            model = model.to(device)
+
+            for param in model.parameters():
+                param.requires_grad = False
+
+            self.extractors.append(extractor)
+            self.models.append(model)
+
+        self.gpu_queues = [queue.Queue() for _ in range(len(self.devices))]
+        self.result_queue = queue.Queue()
+        self.workers = []
+        for i in range(len(self.devices)):
+            worker = threading.Thread(target=self._gpu_worker, args=(i,))
+            worker.daemon = True
+            worker.start()
+            self.workers.append(worker)
+
+    def _gpu_worker(self, gpu_id):
+        device = self.devices[gpu_id]
+        model = self.models[gpu_id]
+        extractor = self.extractors[gpu_id]
+        while True:
+            task = self.gpu_queues[gpu_id].get()
+            if task is None:
+                break
+            signals, masks, use_mlm, task_id = task
+            try:
+                inputs = extractor(
+                    signals, sampling_rate=SR, return_tensors="pt", padding=True
+                )
+                input_values = inputs.input_values.to(device, non_blocking=True)
+
+                torch.cuda.empty_cache()
+
+                orig_mode = model.training
+                model.train() if use_mlm else model.eval()
+                with torch.no_grad():
+                    with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
+                        hs = model(
+                            input_values, output_hidden_states=True
+                        ).hidden_states[self.layer]
+                model.train(orig_mode)
+
+                B, T, D = hs.shape
+                keep = []
+                for b in range(B):
+                    mask_b = masks[b].float().unsqueeze(0).unsqueeze(0).to(device)
+                    mask_t = F.interpolate(mask_b, size=T, mode="nearest")[0, 0].bool()
+                    keep.append(hs[b][mask_t].cpu())
+
+                # Aggressive cleanup
+                del hs, input_values, inputs
+                torch.cuda.empty_cache()
+
+                if keep:
+                    L_max = max(x.shape[0] for x in keep)
+                    keep_padded = [
+                        F.pad(x, (0, 0, 0, L_max - x.shape[0])) for x in keep
+                    ]
+                    result = torch.stack(keep_padded, dim=0)
+                else:
+                    result = torch.empty(0, 0, 0)
+                self.result_queue.put((task_id, result))
+            except Exception as e:
+                self.result_queue.put((task_id, e))
+            finally:
+                # Always clear cache after processing
+                torch.cuda.empty_cache()
+
+    def process_batch(self, signals, masks, use_mlm=False):
+        if not signals:
+            return torch.empty(0, 0, 0)
+        batch_size = len(signals)
+        split = (batch_size + len(self.devices) - 1) // len(self.devices)
+        results = {}
+        task_id = 0
+        for i in range(0, batch_size, split):
+            end = min(i + split, batch_size)
+            gpu_id = (i // split) % len(self.devices)
+            self.gpu_queues[gpu_id].put(
+                (signals[i:end], masks[i:end], use_mlm, task_id)
+            )
+            task_id += 1
+        for _ in range(task_id):
+            tid, result = self.result_queue.get()
+            if isinstance(result, Exception):
+                raise result
+            results[tid] = result
+        parts = [results[i] for i in range(task_id) if results[i].numel() > 0]
+        return torch.cat(parts, dim=0) if parts else torch.empty(0, 0, 0)
+
+    def cleanup(self):
+        """Explicit cleanup method"""
+        for q in self.gpu_queues:
+            q.put(None)
+        for w in self.workers:
+            w.join(timeout=5.0)
+        for model in self.models:
+            del model
+        for extractor in self.extractors:
+            del extractor
+        self.models.clear()
+        self.extractors.clear()
+        torch.cuda.empty_cache()
+        gc.collect()
+
+    def __del__(self):
+        self.cleanup()
+
+
+# NO CACHE - we need to clean up models properly between runs
+def get_model_config(layer):
+    return {
+        "raw": (None, None, None),
+        "wavlm": ("microsoft/wavlm-large", WavLMModel, layer),
+        "wav2vec2": ("facebook/wav2vec2-large-lv60", Wav2Vec2Model, layer),
+        "hubert": ("facebook/hubert-large-ll60k", HubertModel, layer),
+        "wavlm_base": ("microsoft/wavlm-base", WavLMModel, layer),
+        "wav2vec2_base": ("facebook/wav2vec2-base", Wav2Vec2Model, layer),
+        "hubert_base": ("facebook/hubert-base-ls960", HubertModel, layer),
+        "wav2vec2_xlsr": ("facebook/wav2vec2-large-xlsr-53", Wav2Vec2Model, layer),
+    }
+
+
+# Store loaded models globally to properly manage them
+_loaded_models = {}
+
+
+def load_model(name, layer, max_gpus=None):
+    global _loaded_models
+
+    # Clean up any previously loaded models first
+    if _loaded_models:
+        for key, model_data in _loaded_models.items():
+            if isinstance(model_data, tuple) and len(model_data) == 2:
+                if isinstance(model_data[0], BalancedDualGPUModel):
+                    model_data[0].cleanup()
+                elif isinstance(model_data[0], tuple):
+                    # Single GPU model
+                    _, model = model_data[0]
+                    del model
+        _loaded_models.clear()
+        torch.cuda.empty_cache()
+        gc.collect()
+
+    if name.lower() in {"raw", "waveform"}:
+        return "raw", layer
+
+    ngpu = get_gpu_count(max_gpus)
+    if ngpu > 1:
+        model = BalancedDualGPUModel(name, layer, max_gpus)
+        _loaded_models[name] = (model, layer)
+        return model, layer
+    else:
+        ckpt, cls, layer_eff = get_model_config(layer)[name]
+        extractor = Wav2Vec2FeatureExtractor.from_pretrained(ckpt)
+
+        device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        attn_impl = "eager" if cls is WavLMModel else "sdpa"
+        model = cls.from_pretrained(
+            ckpt,
+            output_hidden_states=True,
+            use_safetensors=True,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            attn_implementation=attn_impl
+        )
+        model.eval()
+        model = model.to(device)
+
+        for param in model.parameters():
+            param.requires_grad = False
+
+        model_tuple = ((extractor, model), layer_eff)
+        _loaded_models[name] = model_tuple
+        return (extractor, model), layer_eff
+
+
+def cleanup_all_models():
+    """Call this at the end of each experiment to ensure complete cleanup"""
+    global _loaded_models
+    if _loaded_models:
+        for key, model_data in _loaded_models.items():
+            if isinstance(model_data, tuple) and len(model_data) == 2:
+                if isinstance(model_data[0], BalancedDualGPUModel):
+                    model_data[0].cleanup()
+                elif isinstance(model_data[0], tuple):
+                    # Single GPU model
+                    _, model = model_data[0]
+                    del model
+        _loaded_models.clear()
+        torch.cuda.empty_cache()
+        gc.collect()
+
+
+def embed_batch_raw(signals, masks_audio):
+    win = int(ENERGY_WIN_MS * SR / 1000)
+    hop = int(ENERGY_HOP_MS * SR / 1000)
+    reps, L_max = [], 0
+    for sig_np, mask_np in zip(signals, masks_audio):
+        x = torch.as_tensor(sig_np[:-1], dtype=torch.float32)
+        frames = x.unfold(0, win, hop)
+        mask = torch.as_tensor(mask_np[: len(frames)], dtype=torch.bool)
+        keep = frames[mask] if mask.any() else frames[:1]
+        reps.append(keep)
+        L_max = max(L_max, keep.size(0))
+    reps = [F.pad(r, (0, 0, 0, L_max - r.size(0))) for r in reps]
+    return torch.stack(reps, dim=0)
+
+
+def embed_batch_single_gpu(
+    signals, masks_audio, extractor, model, layer, use_mlm=False
+):
+    if not signals:
+        return torch.empty(0, 0, 0)
+    device = next(model.parameters()).device
+
+    max_batch = 2
+    all_keeps = []
+
+    for i in range(0, len(signals), max_batch):
+        batch_signals = signals[i:i + max_batch]
+        batch_masks = masks_audio[i:i + max_batch]
+
+        inputs = extractor(batch_signals, sampling_rate=SR, return_tensors="pt", padding=True)
+        input_values = inputs.input_values.to(device, non_blocking=True)
+
+        orig_mode = model.training
+        model.train() if use_mlm else model.eval()
+
+        with torch.no_grad():
+            with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
+                hs = model(input_values, output_hidden_states=True).hidden_states[layer]
+        model.train(orig_mode)
+
+        B, T, D = hs.shape
+        for b in range(B):
+            mask_b = batch_masks[b].float().unsqueeze(0).unsqueeze(0).to(device)
+            mask_t = F.interpolate(mask_b, size=T, mode="nearest")[0, 0].bool()
+            all_keeps.append(hs[b][mask_t].cpu())
+
+        # Aggressive cleanup
+        del hs, input_values, inputs
+        torch.cuda.empty_cache()
+
+    if all_keeps:
         L_max = max(x.shape[0] for x in all_keeps)
         keep_padded = [F.pad(x, (0, 0, 0, L_max - x.shape[0])) for x in all_keeps]
         result = torch.stack(keep_padded, dim=0)
         # Clean up intermediate lists
         del all_keeps, keep_padded
         return result
     else:
         return torch.empty(0, 0, 0)
 
 
 def embed_batch(signals, masks_audio, model_wrapper, layer, use_mlm=False):
     if model_wrapper == "raw":
         return embed_batch_raw(signals, masks_audio)
     if isinstance(model_wrapper, BalancedDualGPUModel):
         all_embeddings = []
         batch_size = min(BATCH_SIZE, 2)
         for i in range(0, len(signals), batch_size):
             batch_emb = model_wrapper.process_batch(
                 signals[i: i + batch_size], masks_audio[i: i + batch_size], use_mlm
             )
             if batch_emb.numel() > 0:
                 all_embeddings.append(batch_emb)
             # Clear cache after each batch
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
+            torch.cuda.empty_cache()
 
         if all_embeddings:
             result = torch.cat(all_embeddings, dim=0)
             del all_embeddings
             return result
         else:
             return torch.empty(0, 0, 0)
     else:
         extractor, model = model_wrapper
         return embed_batch_single_gpu(
             signals, masks_audio, extractor, model, layer, use_mlm=use_mlm
         )
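BalancedDualGPUModel's dispatch is a plain queue-per-device fan-out with a shared result queue: exceptions ride the same channel and are re-raised by the caller, and a None sentinel shuts workers down, as in cleanup(). A stripped-down, CPU-only sketch of that pattern (doubling stands in for the forward pass):

import queue
import threading

def worker(task_q, result_q):
    while True:
        task = task_q.get()
        if task is None:                           # sentinel: shut the worker down
            break
        payload, task_id = task
        try:
            result_q.put((task_id, payload * 2))   # stand-in for model inference
        except Exception as e:
            result_q.put((task_id, e))             # errors travel the result path

task_qs = [queue.Queue() for _ in range(2)]
result_q = queue.Queue()
threads = [threading.Thread(target=worker, args=(q, result_q), daemon=True)
           for q in task_qs]
for t in threads:
    t.start()

for task_id, x in enumerate([1, 2, 3, 4]):
    task_qs[task_id % 2].put((x, task_id))         # round-robin across "devices"
out = {}
for _ in range(4):
    tid, res = result_q.get()
    if isinstance(res, Exception):
        raise res
    out[tid] = res
print([out[i] for i in range(4)])                  # [2, 4, 6, 8], in submit order

for q in task_qs:
    q.put(None)
for t in threads:
    t.join()

The intended module lifecycle can be exercised through the checkpoint-free "raw" path; this hedged usage sketch assumes models.py is importable alongside config.py and utils.py, and the random signal is a stand-in:

import numpy as np
import torch
from models import load_model, embed_batch, cleanup_all_models

model_wrapper, layer_eff = load_model("raw", layer=0)

sig = np.random.randn(16_000).astype(np.float32)   # 1 s of audio at 16 kHz
n_frames = (len(sig) - 1 - 320) // 320 + 1         # frames from unfold(win=320, hop=320)
mask = torch.ones(n_frames, dtype=torch.bool)      # keep every frame

emb = embed_batch([sig], [mask], model_wrapper, layer_eff)
print(emb.shape)                                   # torch.Size([1, 49, 320]): raw windows as features

cleanup_all_models()                               # evict cached models between runs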