Commit 812b01c · 1 Parent(s): 7948b62
Implement TaikoConformer7 model, loss function, preprocessing, and training pipeline
- Added TaikoLoss class for custom loss calculation with NPS penalties (a hedged sketch of the idea follows this list).
- Developed TaikoConformer7 model architecture using Conformer and CNN layers.
- Created preprocessing functions to handle audio data and generate labels.
- Implemented training script with data loading, model training, and validation.
- Integrated TensorBoard logging for loss and energy comparisons during training.
- Added support for sliding NPS labels in preprocessing and loss calculation.
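The tc7 loss file is listed among the changes below, but its diff is not rendered on this page, so the following is only a rough, hypothetical sketch of what an NPS penalty term can look like; the function name, thresholding proxy, and weighting are assumptions, not the commit's actual TaikoLoss:

import torch.nn.functional as F

def nps_penalty(pred_energies, target_nps, hop_sec, threshold=0.5):
    # Hypothetical sketch, NOT the commit's TaikoLoss. Crude proxy: count
    # frames where any energy channel exceeds the threshold, convert to a
    # notes-per-second estimate, and penalize deviation from the
    # conditioning NPS target.
    pred_notes = (pred_energies > threshold).any(dim=2).float().sum(dim=1)  # (B,)
    duration_sec = pred_energies.shape[1] * hop_sec
    pred_nps = pred_notes / duration_sec
    return F.mse_loss(pred_nps, target_nps)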
- .gitignore +1 -0
- app.py +300 -0
- requirements.txt +14 -0
- tc5/__init__.py +0 -0
- tc5/config.py +25 -0
- tc5/dataset.py +21 -0
- tc5/infer.py +356 -0
- tc5/loss.py +65 -0
- tc5/model.py +133 -0
- tc5/preprocess.py +215 -0
- tc5/train.py +323 -0
- tc6/__init__.py +0 -0
- tc6/config.py +25 -0
- tc6/dataset.py +21 -0
- tc6/infer.py +354 -0
- tc6/loss.py +65 -0
- tc6/model.py +166 -0
- tc6/preprocess.py +258 -0
- tc6/train.py +336 -0
- tc7/__init__.py +0 -0
- tc7/config.py +27 -0
- tc7/dataset.py +21 -0
- tc7/infer.py +354 -0
- tc7/loss.py +94 -0
- tc7/model.py +166 -0
- tc7/preprocess.py +400 -0
- tc7/train.py +300 -0
.gitignore
ADDED
@@ -0,0 +1 @@
__pycache__
app.py
ADDED
@@ -0,0 +1,300 @@
import gradio as gr
import torch
from tc5.config import SAMPLE_RATE, HOP_LENGTH
from tc5.model import TaikoConformer5
from tc5 import infer as tc5infer
from tc6.model import TaikoConformer6
from tc6 import infer as tc6infer
from tc7.model import TaikoConformer7
from tc7 import infer as tc7infer
from gradio_client import Client, handle_file
import tempfile

DEVICE = torch.device("cpu")

# Load model once
tc5 = TaikoConformer5.from_pretrained("JacobLinCool/taiko-conformer-5")
tc5.to(DEVICE)
tc5.eval()

# Load TC6 model
tc6 = TaikoConformer6.from_pretrained("JacobLinCool/taiko-conformer-6")
tc6.to(DEVICE)
tc6.eval()

# Load TC7 model
tc7 = TaikoConformer7.from_pretrained("JacobLinCool/taiko-conformer-7")
tc7.to(DEVICE)
tc7.eval()

synthesizer = Client("ryanlinjui/taiko-music-generator")


def infer_tc5(audio, nps, bpm):
    audio_path = audio
    filename = audio_path.split("/")[-1]
    # Preprocess
    mel_input, nps_input = tc5infer.preprocess_audio(audio_path, nps)
    # Inference
    don_energy, ka_energy, drumroll_energy = tc5infer.run_inference(
        tc5, mel_input, nps_input, DEVICE
    )
    output_frame_hop_sec = HOP_LENGTH / SAMPLE_RATE
    onsets = tc5infer.decode_onsets(
        don_energy,
        ka_energy,
        drumroll_energy,
        output_frame_hop_sec,
        threshold=0.3,
        min_distance_frames=3,
    )
    # Generate plot
    plot = tc5infer.plot_results(
        mel_input,
        don_energy,
        ka_energy,
        drumroll_energy,
        onsets,
        output_frame_hop_sec,
    )
    # Generate TJA content
    tja_content = tc5infer.write_tja(onsets, bpm=bpm, audio=filename)

    # write TJA content to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".tja") as temp_tja_file:
        temp_tja_file.write(tja_content.encode("utf-8"))
        tja_path = temp_tja_file.name

    result = synthesizer.predict(
        param_0=handle_file(tja_path),
        param_1=handle_file(audio_path),
        param_2="達人譜面 / Master",
        param_3=16,
        param_4=5,
        param_5=5,
        param_6=5,
        param_7=5,
        param_8=5,
        param_9=5,
        param_10=5,
        param_11=5,
        param_12=5,
        param_13=5,
        param_14=5,
        param_15=5,
        api_name="/handle",
    )

    oni_audio = result[1]

    return oni_audio, plot, tja_content


def infer_tc6(audio, nps, bpm, difficulty, level):
    audio_path = audio
    filename = audio_path.split("/")[-1]
    # Preprocess
    mel_input = tc6infer.preprocess_audio(audio_path)
    nps_input = torch.tensor(nps, dtype=torch.float32).to(DEVICE)
    difficulty_input = torch.tensor(difficulty, dtype=torch.float32).to(DEVICE)
    level_input = torch.tensor(level, dtype=torch.float32).to(DEVICE)
    # Inference
    don_energy, ka_energy, drumroll_energy = tc6infer.run_inference(
        tc6, mel_input, nps_input, difficulty_input, level_input, DEVICE
    )
    output_frame_hop_sec = HOP_LENGTH / SAMPLE_RATE
    onsets = tc6infer.decode_onsets(
        don_energy,
        ka_energy,
        drumroll_energy,
        output_frame_hop_sec,
        threshold=0.3,
        min_distance_frames=3,
    )
    # Generate plot
    plot = tc6infer.plot_results(
        mel_input,
        don_energy,
        ka_energy,
        drumroll_energy,
        onsets,
        output_frame_hop_sec,
    )
    # Generate TJA content
    tja_content = tc6infer.write_tja(onsets, bpm=bpm, audio=filename)

    # write TJA content to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".tja") as temp_tja_file:
        temp_tja_file.write(tja_content.encode("utf-8"))
        tja_path = temp_tja_file.name

    result = synthesizer.predict(
        param_0=handle_file(tja_path),
        param_1=handle_file(audio_path),
        param_2="達人譜面 / Master",
        param_3=16,
        param_4=5,
        param_5=5,
        param_6=5,
        param_7=5,
        param_8=5,
        param_9=5,
        param_10=5,
        param_11=5,
        param_12=5,
        param_13=5,
        param_14=5,
        param_15=5,
        api_name="/handle",
    )

    oni_audio = result[1]

    return oni_audio, plot, tja_content


def infer_tc7(audio, nps, bpm, difficulty, level):
    audio_path = audio
    filename = audio_path.split("/")[-1]
    # Preprocess
    mel_input = tc7infer.preprocess_audio(audio_path)
    nps_input = torch.tensor(nps, dtype=torch.float32).to(DEVICE)
    difficulty_input = torch.tensor(difficulty, dtype=torch.float32).to(DEVICE)
    level_input = torch.tensor(level, dtype=torch.float32).to(DEVICE)
    # Inference
    don_energy, ka_energy, drumroll_energy = tc7infer.run_inference(
        tc7, mel_input, nps_input, difficulty_input, level_input, DEVICE
    )
    output_frame_hop_sec = HOP_LENGTH / SAMPLE_RATE
    onsets = tc7infer.decode_onsets(
        don_energy,
        ka_energy,
        drumroll_energy,
        output_frame_hop_sec,
        threshold=0.3,
        min_distance_frames=3,
    )
    # Generate plot
    plot = tc7infer.plot_results(
        mel_input,
        don_energy,
        ka_energy,
        drumroll_energy,
        onsets,
        output_frame_hop_sec,
    )
    # Generate TJA content
    tja_content = tc7infer.write_tja(onsets, bpm=bpm, audio=filename)

    # write TJA content to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".tja") as temp_tja_file:
        temp_tja_file.write(tja_content.encode("utf-8"))
        tja_path = temp_tja_file.name

    result = synthesizer.predict(
        param_0=handle_file(tja_path),
        param_1=handle_file(audio_path),
        param_2="達人譜面 / Master",
        param_3=16,
        param_4=5,
        param_5=5,
        param_6=5,
        param_7=5,
        param_8=5,
        param_9=5,
        param_10=5,
        param_11=5,
        param_12=5,
        param_13=5,
        param_14=5,
        param_15=5,
        api_name="/handle",
    )

    oni_audio = result[1]

    return oni_audio, plot, tja_content


def run_inference(audio, model_choice, nps, bpm, difficulty, level):
    if model_choice == "TC5":
        return infer_tc5(audio, nps, bpm)
    elif model_choice == "TC6":
        return infer_tc6(audio, nps, bpm, difficulty, level)
    else:  # TC7
        return infer_tc7(audio, nps, bpm, difficulty, level)


with gr.Blocks() as demo:
    gr.Markdown("# Taiko Conformer 5/6/7 Demo")
    with gr.Row():
        audio_input = gr.Audio(sources="upload", type="filepath", label="Input Audio")

    with gr.Row():
        model_choice = gr.Dropdown(
            choices=["TC5", "TC6", "TC7"],
            value="TC7",
            label="Model Selection",
            info="Choose between TaikoConformer 5, 6 or 7",
        )

    with gr.Row():
        nps = gr.Slider(
            value=5.0,
            minimum=0.5,
            maximum=11.0,
            step=0.5,
            label="NPS (Notes Per Second)",
        )
        bpm = gr.Slider(
            value=240,
            minimum=160,
            maximum=640,
            step=1,
            label="BPM (Used by TJA Quantization)",
        )

    with gr.Row():
        difficulty = gr.Slider(
            value=3.0,
            minimum=1.0,
            maximum=3.0,
            step=1.0,
            label="Difficulty",
            visible=False,
            info="1=Normal, 2=Hard, 3=Oni",
        )
        level = gr.Slider(
            value=8.0,
            minimum=1.0,
            maximum=10.0,
            step=1.0,
            label="Level",
            visible=False,
            info="Difficulty level from 1 to 10",
        )

    audio_output = gr.Audio(label="Generated Audio", type="filepath")
    plot_output = gr.Plot(label="Onset/Energy Plot")
    tja_output = gr.Textbox(label="TJA File Content", show_copy_button=True)
    run_btn = gr.Button("Run Inference")

    # Update visibility of TC6/TC7-specific controls based on model selection
    def update_visibility(model_choice):
        if model_choice == "TC7" or model_choice == "TC6":
            return gr.update(visible=True), gr.update(visible=True)
        else:
            return gr.update(visible=False), gr.update(visible=False)

    model_choice.change(
        update_visibility, inputs=[model_choice], outputs=[difficulty, level]
    )

    run_btn.click(
        run_inference,
        inputs=[audio_input, model_choice, nps, bpm, difficulty, level],
        outputs=[audio_output, plot_output, tja_output],
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt
ADDED
@@ -0,0 +1,14 @@
torch
torchaudio
datasets
huggingface_hub
librosa
soundfile
matplotlib
tensorboard
black
tqdm
safetensors
accelerate
tja
spaces
tc5/__init__.py
ADDED
File without changes
tc5/config.py
ADDED
@@ -0,0 +1,25 @@
import torch

# ─── 1) CONFIG ─────────────────────────────────────────────────────
SAMPLE_RATE = 22050
N_MELS = 80
HOP_LENGTH = 256  # ~86 fps
TIME_SUB = 1
CNN_CH = 128
N_HEADS = 4
D_MODEL = 256
FF_DIM = 512
N_LAYERS = 4
DEPTHWISE_CONV_KERNEL_SIZE = 31
DROPOUT = 0.1
HIDDEN_DIM = 64
N_TYPES = 7
BATCH_SIZE = 4
GRAD_ACCUM_STEPS = 4
LR = 3e-4
EPOCHS = 30
DEVICE = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
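A quick sanity check of the frame rate implied by these values (illustrative, not part of the commit):

from tc5.config import SAMPLE_RATE, HOP_LENGTH

fps = SAMPLE_RATE / HOP_LENGTH               # ≈ 86.13, matching the "~86 fps" comment
frame_ms = 1000 * HOP_LENGTH / SAMPLE_RATE   # ≈ 11.61 ms per output frame
print(f"{fps:.2f} fps, {frame_ms:.2f} ms/frame")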
tc5/dataset.py
ADDED
@@ -0,0 +1,21 @@
from datasets import load_dataset, concatenate_datasets

# ds1 = load_dataset("JacobLinCool/taiko-2023-1.1", split="train")
# ds2 = load_dataset("JacobLinCool/taiko-2023-1.2", split="train")
# ds3 = load_dataset("JacobLinCool/taiko-2023-1.3", split="train")
# ds4 = load_dataset("JacobLinCool/taiko-2023-1.4", split="train")
# ds5 = load_dataset("JacobLinCool/taiko-2023-1.5", split="train")
# ds6 = load_dataset("JacobLinCool/taiko-2023-1.6", split="train")
# ds7 = load_dataset("JacobLinCool/taiko-2023-1.7", split="train")
# ds = concatenate_datasets([ds1, ds2, ds3, ds4, ds5, ds6, ds7]).with_format("torch")

# good = list(range(len(ds)))
# good.remove(1079)  # 1079 has file problem
# ds = ds.select(good)

# for local test
ds = (
    load_dataset("JacobLinCool/taiko-2023-1.6", split="train")
    .with_format("torch")
    .select(range(10))
)
tc5/infer.py
ADDED
@@ -0,0 +1,356 @@
import time
import torch
import torchaudio
import matplotlib.pyplot as plt
import numpy as np
from .config import SAMPLE_RATE, N_MELS, HOP_LENGTH
import torch.profiler


# --- PREPROCESSING (match training) ---
def preprocess_audio(audio_path, nps=5.0):
    wav, sr = torchaudio.load(audio_path)
    wav = wav.mean(dim=0)  # mono
    if sr != SAMPLE_RATE:
        wav = torchaudio.functional.resample(wav, sr, SAMPLE_RATE)
    wav = wav / (wav.abs().max() + 1e-8)  # Normalize audio

    nps_tensor = torch.tensor(nps, dtype=torch.float32)

    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_mels=N_MELS,
        hop_length=HOP_LENGTH,
        n_fft=2048,
    )
    mel = mel_transform(wav)
    # mel shape is (n_mels, T_mel); unsqueeze for batch later in run_inference
    return mel, nps_tensor  # mel is (N_MELS, T_mel)


# --- INFERENCE ---
def run_inference(model, mel_input, nps_input, device):
    model.eval()
    with torch.no_grad():
        mel = mel_input.to(device).unsqueeze(0)  # (1, N_MELS, T_mel)
        nps = nps_input.to(device).unsqueeze(0)  # (1,)

        mel_cnn_input = mel.unsqueeze(1)  # (1, 1, N_MELS, T_mel)

        conformer_lengths = torch.tensor(
            [mel_cnn_input.shape[-1]], dtype=torch.long, device=device
        )

        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                *(
                    [torch.profiler.ProfilerActivity.CUDA]
                    if device.type == "cuda"
                    else []
                ),
            ],
            record_shapes=True,
            profile_memory=True,
            with_stack=False,
            with_flops=True,
        ) as prof:
            out_dict = model(mel_cnn_input, conformer_lengths, nps)
        print(
            prof.key_averages().table(
                sort_by=(
                    "self_cuda_memory_usage"
                    if device.type == "cuda"
                    else "self_cpu_time_total"
                ),
                row_limit=20,
            )
        )

        energies = out_dict["presence"].squeeze(0).cpu().numpy()

        don_energy = energies[:, 0]
        ka_energy = energies[:, 1]
        drumroll_energy = energies[:, 2]

        return don_energy, ka_energy, drumroll_energy


# --- DECODE TO ONSETS ---
def decode_onsets(
    don_energy,
    ka_energy,
    drumroll_energy,
    hop_sec,
    threshold=0.5,
    min_distance_frames=3,
):
    results = []
    T_out = len(don_energy)
    last_onset_frame = -min_distance_frames

    for i in range(1, T_out - 1):  # Iterate considering neighbors for peak detection
        if i < last_onset_frame + min_distance_frames:
            continue

        e_don, e_ka, e_drum = don_energy[i], ka_energy[i], drumroll_energy[i]
        energies_at_i = {
            1: e_don,
            2: e_ka,
            5: e_drum,
        }  # Type mapping: 1:Don, 2:Ka, 5:Drumroll

        # Find which energy is max and whether it is a peak above threshold.
        # Sort by energy value descending to prioritize higher energy in case
        # of ties for the peak condition.
        sorted_types_by_energy = sorted(
            energies_at_i.keys(), key=lambda x: energies_at_i[x], reverse=True
        )

        detected_this_frame = False
        for onset_type in sorted_types_by_energy:
            current_energy_series = None
            if onset_type == 1:
                current_energy_series = don_energy
            elif onset_type == 2:
                current_energy_series = ka_energy
            elif onset_type == 5:
                current_energy_series = drumroll_energy

            energy_val = current_energy_series[i]

            if (
                energy_val > threshold
                and energy_val > current_energy_series[i - 1]
                and energy_val > current_energy_series[i + 1]
            ):
                # The "highest energy wins" rule is implicitly handled by
                # iterating sorted_types_by_energy and breaking after the
                # first detection.
                results.append((i * hop_sec, onset_type))
                last_onset_frame = i
                detected_this_frame = True
                break  # Only one onset type per frame

    return results


# --- VISUALIZATION ---
def plot_results(
    mel_spectrogram,
    don_energy,
    ka_energy,
    drumroll_energy,
    onsets,
    hop_sec,
    out_path=None,
):
    # mel_spectrogram is (N_MELS, T_mel)
    T_mel = mel_spectrogram.shape[1]
    T_out = len(don_energy)  # Length of energy arrays (model output time dimension)

    # Time axes
    time_axis_mel = np.arange(T_mel) * (HOP_LENGTH / SAMPLE_RATE)
    # The CNN frontend does not stride in time, so the model output length
    # T_out equals T_mel and each output frame spans HOP_LENGTH / SAMPLE_RATE
    # seconds. TIME_SUB only affects label generation in preprocess.py, not
    # the feature rate seen by the Conformer, so the hop_sec passed in by the
    # caller is the right scale for the energy time axis.
    time_axis_energies = np.arange(T_out) * hop_sec

    fig, ax1 = plt.subplots(figsize=(100, 10))

    # Plot Mel Spectrogram on ax1
    mel_db = torchaudio.functional.amplitude_to_DB(
        mel_spectrogram, multiplier=10.0, amin=1e-10, db_multiplier=0.0
    )
    img = ax1.imshow(
        mel_db.numpy(),
        aspect="auto",
        origin="lower",
        cmap="magma",
        extent=[time_axis_mel[0], time_axis_mel[-1], 0, N_MELS],
    )
    ax1.set_title("Mel Spectrogram with Predicted Energies and Onsets")
    ax1.set_xlabel("Time (s)")
    ax1.set_ylabel("Mel Bin")
    fig.colorbar(img, ax=ax1, format="%+2.0f dB")

    # Create a second y-axis for energies
    ax2 = ax1.twinx()
    ax2.plot(time_axis_energies, don_energy, label="Don Energy", color="red")
    ax2.plot(time_axis_energies, ka_energy, label="Ka Energy", color="blue")
    ax2.plot(
        time_axis_energies, drumroll_energy, label="Drumroll Energy", color="green"
    )
    ax2.set_ylabel("Energy")
    ax2.set_ylim(0, 1.2)  # Assuming energies are somewhat normalized or bounded

    # Overlay onsets from decode_onsets (t is already in seconds)
    labeled_types = set()
    # Group drumrolls into segments (reuse logic from write_tja)
    drumroll_times = [t_sec for t_sec, typ in onsets if typ == 5]
    drumroll_times.sort()
    drumroll_segments = []
    if drumroll_times:
        seg_start = drumroll_times[0]
        prev = drumroll_times[0]
        for t in drumroll_times[1:]:
            if t - prev <= hop_sec * 6:  # up to 5-frame gap
                prev = t
            else:
                drumroll_segments.append((seg_start, prev))
                seg_start = t
                prev = t
        drumroll_segments.append((seg_start, prev))
    # Plot Don/Ka onsets as vertical lines
    for t_sec, typ in onsets:
        if typ == 5:
            continue  # skip drumroll onsets
        color_map = {1: "darkred", 2: "darkblue"}
        label_map = {1: "Don Onset", 2: "Ka Onset"}
        line_color = color_map.get(typ, "black")
        line_label = label_map.get(typ, f"Type {typ} Onset")
        if typ not in labeled_types:
            ax1.axvline(
                t_sec, color=line_color, linestyle="--", alpha=0.9, label=line_label
            )
            labeled_types.add(typ)
        else:
            ax1.axvline(t_sec, color=line_color, linestyle="--", alpha=0.9)
    # Plot drumroll segments as shaded regions
    for seg_start, seg_end in drumroll_segments:
        ax1.axvspan(
            seg_start,
            seg_end + hop_sec,
            color="green",
            alpha=0.2,
            label="Drumroll Segment" if "drumroll" not in labeled_types else None,
        )
        labeled_types.add("drumroll")

    # Combine legends from both axes
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc="upper right")

    fig.tight_layout()

    # Save to file if a path is provided; otherwise return the figure
    if out_path:
        plt.savefig(out_path)
        print(f"Saved plot to {out_path}")
        plt.close(fig)
        return out_path
    else:
        # Return plot as an in-memory figure
        return fig


def write_tja(onsets, out_path=None, bpm=160, quantize=96, audio="audio.wav"):
    # TJA types: 0:no note, 1:Don, 2:Ka, 3:BigDon, 4:BigKa, 5:DrumrollStart, 8:DrumrollEnd
    # Model output types: 1:Don, 2:Ka, 5:Drumroll (interpreted as start/single)
    sec_per_beat = 60 / bpm
    beats_per_measure = 4  # Assuming 4/4 time signature
    sec_per_measure = sec_per_beat * beats_per_measure
    # Step 1: Map onsets to (measure_idx, slot, typ)
    slot_events = []
    for t, typ in onsets:
        measure_idx = int(t // sec_per_measure)
        t_in_measure = t % sec_per_measure
        slot = int(round(t_in_measure / sec_per_measure * quantize))
        if slot >= quantize:
            slot = quantize - 1
        slot_events.append((measure_idx, slot, typ))
    # Step 2: Build measure/slot grid
    if slot_events:
        max_measure_idx = max(m for m, _, _ in slot_events)
    else:
        max_measure_idx = -1
    measures = {i: [0] * quantize for i in range(max_measure_idx + 1)}
    # Step 3: Place Don/Ka, collect drumrolls
    drumroll_slots = set()
    for m, s, typ in slot_events:
        if typ in [1, 2]:
            measures[m][s] = typ
        elif typ == 5:
            drumroll_slots.add((m, s))
    # Step 4: Process drumrolls into contiguous regions, mark 5 (start) and 8 (end)
    # Flatten all slots to a sorted list of (measure, slot)
    drumroll_list = sorted(list(drumroll_slots))
    # Group into contiguous regions (allowing a gap of 5 slots)
    grouped = []
    group = []
    for ms in drumroll_list:
        if not group:
            group = [ms]
        else:
            last_m, last_s = group[-1]
            m, s = ms
            # Calculate slot distance, considering measure wrap
            slot_dist = None
            if m == last_m:
                slot_dist = s - last_s
            elif m == last_m + 1 and last_s <= quantize - 1:
                slot_dist = (quantize - 1 - last_s) + s + 1
            else:
                slot_dist = None
            # Allow gap of up to 5 slots (slot_dist <= 6)
            if slot_dist is not None and 1 <= slot_dist <= 6:
                group.append(ms)
            else:
                grouped.append(group)
                group = [ms]
    if group:
        grouped.append(group)
    # Mark 5 (start) and 8 (end) for each group
    for region in grouped:
        if len(region) == 1:
            m, s = region[0]
            measures[m][s] = 5
            # Place 8 in next slot (or next measure if at end)
            if s < quantize - 1:
                measures[m][s + 1] = 8
            elif m < max_measure_idx:
                measures[m + 1][0] = 8
        else:
            m_start, s_start = region[0]
            m_end, s_end = region[-1]
            measures[m_start][s_start] = 5
            measures[m_end][s_end] = 8
            # Middle slots stay 0 (already 0 by default)

    # Step 5: Generate TJA content
    tja_content = []
    tja_content.append(f"TITLE:{audio} (TC5, {time.strftime('%Y-%m-%d %H:%M:%S')})")
    tja_content.append(f"BPM:{bpm}")
    tja_content.append(f"WAVE:{audio}")
    tja_content.append("OFFSET:0")
    tja_content.append("COURSE:Oni\nLEVEL:9\n")
    tja_content.append("#START")
    for i in range(max_measure_idx + 1):
        notes = measures.get(i, [0] * quantize)
        line = "".join(str(n) for n in notes)
        tja_content.append(line + ",")
    tja_content.append("#END")

    tja_string = "\n".join(tja_content)

    # If out_path is provided, also write to file
    if out_path:
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(tja_string)
        print(f"TJA chart saved to {out_path}")

    return tja_string
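Taken together, these helpers form the tc5 path that app.py wires into the demo. A minimal end-to-end sketch (illustrative; "song.ogg" is a placeholder filename, and the checkpoint is the one app.py loads):

import torch
from tc5.config import SAMPLE_RATE, HOP_LENGTH
from tc5.model import TaikoConformer5
from tc5 import infer

device = torch.device("cpu")
model = TaikoConformer5.from_pretrained("JacobLinCool/taiko-conformer-5").to(device)

mel, nps = infer.preprocess_audio("song.ogg", nps=5.0)  # "song.ogg" is hypothetical
don, ka, drumroll = infer.run_inference(model, mel, nps, device)
hop_sec = HOP_LENGTH / SAMPLE_RATE
onsets = infer.decode_onsets(don, ka, drumroll, hop_sec, threshold=0.3, min_distance_frames=3)
print(infer.write_tja(onsets, bpm=240, audio="song.ogg")[:200])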
tc5/loss.py
ADDED
@@ -0,0 +1,65 @@
import torch
import torch.nn as nn


class TaikoEnergyLoss(nn.Module):
    def __init__(self, reduction="mean"):
        super().__init__()
        # Use 'none' reduction to get element-wise losses, then manually apply masking and reduction
        self.mse_loss = nn.MSELoss(reduction="none")
        self.reduction = reduction

    def forward(self, outputs, batch):
        """
        Calculates the MSE loss for energy-based predictions.

        Args:
            outputs (dict): Model output, containing 'presence' tensor.
                outputs['presence'] shape: (B, T, 3) for don, ka, drumroll energies.
            batch (dict): Batch data from collate_fn, containing true labels and lengths.
                batch['don_labels'], batch['ka_labels'], batch['drumroll_labels'] shape: (B, T)
                batch['lengths'] shape: (B,) - valid sequence lengths for time dimension T.
        Returns:
            torch.Tensor: The calculated loss.
        """
        pred_energies = outputs["presence"]  # (B, T, 3)

        true_don = batch["don_labels"]  # (B, T)
        true_ka = batch["ka_labels"]  # (B, T)
        true_drumroll = batch["drumroll_labels"]  # (B, T)

        # Stack true labels to match the structure of pred_energies (B, T, 3)
        true_energies = torch.stack([true_don, true_ka, true_drumroll], dim=2)

        B, T, _ = pred_energies.shape

        # Create a mask based on batch['lengths'] to ignore padded parts of
        # sequences; batch['lengths'] gives the actual length of each sequence
        # in the batch. mask shape: (B, T)
        mask_2d = (
            torch.arange(T, device=pred_energies.device).expand(B, T)
            < batch["lengths"].unsqueeze(1)
        )
        # Expand mask to (B, T, 1) to broadcast across the 3 energy channels
        mask_3d = mask_2d.unsqueeze(2)

        # Calculate element-wise MSE loss
        loss_elementwise = self.mse_loss(pred_energies, true_energies)  # (B, T, 3)

        # Apply the mask to the loss
        masked_loss = loss_elementwise * mask_3d

        if self.reduction == "mean":
            # Sum the loss over all valid (unmasked) elements and divide by the number of valid elements
            total_loss = masked_loss.sum()
            num_valid_elements = mask_3d.sum()  # Total number of unmasked float values
            if num_valid_elements > 0:
                return total_loss / num_valid_elements
            else:
                # Avoid division by zero if there are no valid elements (e.g., empty batch or all lengths are 0)
                return torch.tensor(
                    0.0, device=pred_energies.device, requires_grad=True
                )
        elif self.reduction == "sum":
            return masked_loss.sum()
        else:  # 'none' or any other case
            return masked_loss
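A small smoke test of the masking behavior with dummy tensors (illustrative only, not part of the commit):

import torch
from tc5.loss import TaikoEnergyLoss

B, T = 2, 8
outputs = {"presence": torch.rand(B, T, 3)}
batch = {
    "don_labels": torch.rand(B, T),
    "ka_labels": torch.rand(B, T),
    "drumroll_labels": torch.rand(B, T),
    "lengths": torch.tensor([8, 5]),  # second sample has 3 padded frames
}
loss = TaikoEnergyLoss(reduction="mean")(outputs, batch)
print(loss)  # scalar; the padded frames of sample 2 are excluded from the mean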
tc5/model.py
ADDED
@@ -0,0 +1,133 @@
import torch
import torch.nn as nn
from torchaudio.models import Conformer
from huggingface_hub import PyTorchModelHubMixin
from .config import (
    N_MELS,
    CNN_CH,
    N_HEADS,
    D_MODEL,
    FF_DIM,
    N_LAYERS,
    DROPOUT,
    DEPTHWISE_CONV_KERNEL_SIZE,
    HIDDEN_DIM,
    DEVICE,
)


class TaikoConformer5(nn.Module, PyTorchModelHubMixin):
    def __init__(self):
        super().__init__()
        # 1) CNN frontend: frequency-only pooling
        self.cnn = nn.Sequential(
            nn.Conv2d(1, CNN_CH, 3, stride=(2, 1), padding=1),
            nn.BatchNorm2d(CNN_CH),
            nn.GELU(),
            nn.Dropout2d(DROPOUT),
            nn.Conv2d(CNN_CH, CNN_CH, 3, stride=(2, 1), padding=1),
            nn.BatchNorm2d(CNN_CH),
            nn.GELU(),
            nn.Dropout2d(DROPOUT),
        )
        feat_dim = CNN_CH * (N_MELS // 4)

        # 2) Linear projection to model dimension
        self.proj = nn.Linear(feat_dim, D_MODEL)

        # 3) FiLM conditioning for notes_per_second
        self.film = nn.Linear(1, 2 * D_MODEL)

        # 4) Conformer encoder
        self.encoder = Conformer(
            input_dim=D_MODEL,
            num_heads=N_HEADS,
            ffn_dim=FF_DIM,
            num_layers=N_LAYERS,
            depthwise_conv_kernel_size=DEPTHWISE_CONV_KERNEL_SIZE,
            dropout=DROPOUT,
            use_group_norm=False,
            convolution_first=False,
        )

        # 5) Presence regressor head
        self.presence_regressor = nn.Sequential(
            nn.Dropout(DROPOUT),
            nn.Linear(D_MODEL, HIDDEN_DIM),
            nn.GELU(),
            nn.Dropout(DROPOUT),
            nn.Linear(HIDDEN_DIM, 3),  # Don, Ka, DrumRoll energy
            nn.Sigmoid(),  # Output between 0 and 1
        )

        # 6) Initialize weights
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
            elif isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(
        self, mel: torch.Tensor, lengths: torch.Tensor, notes_per_second: torch.Tensor
    ):
        """
        Args:
            mel: (B, 1, N_MELS, T_mel)
            lengths: (B,) lengths after CNN
            notes_per_second: (B,) stream of control values
        Returns:
            Dict with:
                'presence': (B, T_cnn_out, 3)
                'lengths': lengths
        """
        # CNN frontend
        x = self.cnn(mel)  # (B, C, F, T)
        B, C, F, T = x.size()
        x = x.permute(0, 3, 1, 2).reshape(B, T, C * F)

        # Project to model dimension
        x = self.proj(x)  # (B, T, D_MODEL)

        # FiLM conditioning
        nps = notes_per_second.unsqueeze(-1)  # (B, 1)
        gamma_beta = self.film(nps)  # (B, 2*D_MODEL)
        gamma, beta = gamma_beta.chunk(2, dim=-1)
        x = gamma.unsqueeze(1) * x + beta.unsqueeze(1)

        # Conformer encoder
        x, _ = self.encoder(x, lengths=lengths)

        # Presence prediction
        presence = self.presence_regressor(x)
        return {"presence": presence, "lengths": lengths}


if __name__ == "__main__":
    model = TaikoConformer5().to(device=DEVICE)
    print(model)

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"{name}: {param.numel():,}")

    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {params / 1e6:.2f}M")

    batch_size = 4
    mel_time_steps = 1024
    input_mel = torch.randn(batch_size, 1, N_MELS, mel_time_steps).to(DEVICE)

    conformer_lengths = torch.tensor(
        [mel_time_steps] * batch_size, dtype=torch.long
    ).to(DEVICE)

    notes_per_second_input = torch.tensor([10.0] * batch_size, dtype=torch.float32).to(
        DEVICE
    )

    output = model(input_mel, conformer_lengths, notes_per_second_input)
    print("Output shapes:")
    for key, value in output.items():
        print(f"{key}: {value.shape}")
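Beyond the built-in __main__ check, the frequency-only pooling can be verified in isolation: the two stride-(2, 1) convolutions halve the mel axis twice (80 → 40 → 20) while leaving the time axis untouched, so feat_dim = CNN_CH * (N_MELS // 4) = 128 * 20 = 2560 features per frame before the projection to D_MODEL = 256. A quick check (illustrative):

import torch
from tc5.model import TaikoConformer5

m = TaikoConformer5()
x = torch.randn(1, 1, 80, 100)  # (B, 1, N_MELS, T_mel)
print(m.cnn(x).shape)           # torch.Size([1, 128, 20, 100]): time axis preserved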
tc5/preprocess.py
ADDED
@@ -0,0 +1,215 @@
import math
import torch
import torch.nn as nn
import torchaudio
from torchaudio.transforms import FrequencyMasking
from .config import N_TYPES, SAMPLE_RATE, N_MELS, HOP_LENGTH, TIME_SUB
from .model import TaikoConformer5


mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_mels=N_MELS,
    hop_length=HOP_LENGTH,
    n_fft=2048,
)


freq_mask = FrequencyMasking(freq_mask_param=15)


def preprocess(example, difficulty="oni"):
    wav_tensor = example["audio"]["array"]
    sr = example["audio"]["sampling_rate"]
    # 1) load & resample
    if sr != SAMPLE_RATE:
        wav_tensor = torchaudio.functional.resample(wav_tensor, sr, SAMPLE_RATE)
    # normalize audio
    wav_tensor = wav_tensor / (wav_tensor.abs().max() + 1e-8)
    # add random Gaussian noise
    if torch.rand(1).item() < 0.5:
        wav_tensor = wav_tensor + 0.005 * torch.randn_like(wav_tensor)
    # 2) mel: (1, N_MELS, T)
    mel = mel_transform(wav_tensor).unsqueeze(0)
    # apply SpecAugment; no time masking, since we don't want the model to
    # predict notes at frames that are masked out
    mel = freq_mask(mel)
    _, _, T = mel.shape
    # 3) build label sequence of length ceil(T / TIME_SUB)
    T_sub = math.ceil(T / TIME_SUB)

    # Initialize energy-based labels for Don, Ka, Drumroll
    don_labels = torch.zeros(T_sub, dtype=torch.float32)
    ka_labels = torch.zeros(T_sub, dtype=torch.float32)
    drumroll_labels = torch.zeros(T_sub, dtype=torch.float32)

    # Define exponential decay tail parameters
    tail_length = 40  # number of frames for decay tail
    decay_rate = 8.0  # decay rate parameter, adjust as needed
    tail_kernel = torch.exp(
        -torch.arange(0, tail_length, dtype=torch.float32) / decay_rate
    )

    fps = SAMPLE_RATE / HOP_LENGTH
    num_valid_notes = 0
    for onset in example[difficulty]:
        typ, t_start, t_end, *_ = onset

        # Assuming N_TYPES in config is appropriately set (e.g., 7 or more)
        if typ < 1 or typ > N_TYPES:  # Filter out invalid types
            continue

        num_valid_notes += 1

        f = int(round(t_start.item() * fps))
        idx = f // TIME_SUB
        if 0 <= idx < T_sub:
            # Apply exponential decay kernel to the corresponding energy channel
            # Types 1 and 3 are Don
            if typ == 1 or typ == 3:
                for i, val in enumerate(tail_kernel):
                    target_idx = idx + i
                    if 0 <= target_idx < T_sub:
                        don_labels[target_idx] = max(
                            don_labels[target_idx].item(), val.item()
                        )
            # Types 2 and 4 are Ka
            elif typ == 2 or typ == 4:
                for i, val in enumerate(tail_kernel):
                    target_idx = idx + i
                    if 0 <= target_idx < T_sub:
                        ka_labels[target_idx] = max(
                            ka_labels[target_idx].item(), val.item()
                        )
            # Types 5, 6, 7 are Drumroll
            elif typ >= 5 and typ <= 7:
                f_end = int(round(t_end.item() * fps))
                idx_end = f_end // TIME_SUB

                for dr in range(idx, idx_end):
                    if 0 <= dr < T_sub:
                        drumroll_labels[dr] = 1.0

                for i, val in enumerate(tail_kernel):
                    target_idx = idx_end + i
                    if 0 <= target_idx < T_sub:
                        drumroll_labels[target_idx] = max(
                            drumroll_labels[target_idx].item(), val.item()
                        )

    duration_seconds = wav_tensor.shape[-1] / SAMPLE_RATE
    nps = num_valid_notes / duration_seconds if duration_seconds > 0 else 0.0
    print(
        f"Processed {num_valid_notes} notes in {duration_seconds:.2f} seconds, NPS: {nps:.2f}"
    )

    return {
        "mel": mel,
        "don_labels": don_labels,
        "ka_labels": ka_labels,
        "drumroll_labels": drumroll_labels,
        "nps": torch.tensor(nps, dtype=torch.float32),
        "duration_seconds": torch.tensor(duration_seconds, dtype=torch.float32),
    }


def collate_fn(batch):
    mels_list = [b["mel"].squeeze(0).transpose(0, 1) for b in batch]  # (T, N_MELS)
    # Extract the energy-based labels
    don_labels_list = [b["don_labels"] for b in batch]
    ka_labels_list = [b["ka_labels"] for b in batch]
    drumroll_labels_list = [b["drumroll_labels"] for b in batch]

    nps_list = [b["nps"] for b in batch]  # Extract NPS
    durations_list = [b["duration_seconds"] for b in batch]  # Extract durations

    # Pad mels
    padded_mels = nn.utils.rnn.pad_sequence(
        mels_list, batch_first=True
    )  # (B, T_max, N_MELS)
    # Reshape for CNN: (B, 1, N_MELS, T_max)
    reshaped_mels = padded_mels.transpose(1, 2).unsqueeze(1)

    # Simulate the CNN frontend to get the output time dimension
    dummy_model_for_shape_inference = TaikoConformer5()
    dummy_cnn = dummy_model_for_shape_inference.cnn
    with torch.no_grad():
        cnn_out = dummy_cnn(reshaped_mels)  # Use reshaped_mels that has batch dim
        _, _, _, T_cnn = cnn_out.shape

    padded_don_labels = []
    padded_ka_labels = []
    padded_drumroll_labels = []
    # lengths = []  # superseded by conformer_input_lengths below

    for i in range(len(batch)):
        d_labels = don_labels_list[i]
        k_labels = ka_labels_list[i]
        dr_labels = drumroll_labels_list[i]

        # Assuming all label types have the same original length
        item_original_T_sub = d_labels.shape[0]
        out_len = T_cnn  # Target length for labels is T_cnn

        # Pad or truncate don_labels
        if item_original_T_sub < out_len:
            pad_d = torch.full(
                (out_len - item_original_T_sub,),
                0,  # Pad with 0 for energy labels
                dtype=d_labels.dtype,
                device=d_labels.device,
            )
            padded_d = torch.cat([d_labels, pad_d], dim=0)
        else:
            padded_d = d_labels[:out_len]
        padded_don_labels.append(padded_d)

        # Pad or truncate ka_labels
        if item_original_T_sub < out_len:
            pad_k = torch.full(
                (out_len - item_original_T_sub,),
                0,  # Pad with 0 for energy labels
                dtype=k_labels.dtype,
                device=k_labels.device,
            )
            padded_k = torch.cat([k_labels, pad_k], dim=0)
        else:
            padded_k = k_labels[:out_len]
        padded_ka_labels.append(padded_k)

        # Pad or truncate drumroll_labels
        if item_original_T_sub < out_len:
            pad_dr = torch.full(
                (out_len - item_original_T_sub,),
                0,  # Pad with 0 for energy labels
                dtype=dr_labels.dtype,
                device=dr_labels.device,
            )
            padded_dr = torch.cat([dr_labels, pad_dr], dim=0)
        else:
            padded_dr = dr_labels[:out_len]
        padded_drumroll_labels.append(padded_dr)

    # Conformer input lengths: the CNN does not subsample in time, so T_cnn is
    # effectively the padded mel length. The TIME_SUB-based calculation below
    # is kept from the original logic (with TIME_SUB == 1 it reduces to the
    # raw mel lengths), clamped to T_cnn.
    conformer_input_lengths = [
        math.ceil(mels_list[i].shape[0] / TIME_SUB) for i in range(len(batch))
    ]
    conformer_input_lengths = torch.tensor(
        [min(l, T_cnn) for l in conformer_input_lengths], dtype=torch.long
    )

    return {
        "mel": reshaped_mels,
        "don_labels": torch.stack(padded_don_labels),
        "ka_labels": torch.stack(padded_ka_labels),
        "drumroll_labels": torch.stack(padded_drumroll_labels),
        "lengths": conformer_input_lengths,  # These are for the Conformer model
        "nps": torch.stack(nps_list),
        "durations": torch.stack(durations_list),
    }
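For intuition, the decay tail used for the onset labels spans 40 frames (~0.46 s at ~86 fps) and falls off as exp(-i / 8). Its first few values (illustrative check, not part of the commit):

import torch

tail_kernel = torch.exp(-torch.arange(0, 40, dtype=torch.float32) / 8.0)
print(tail_kernel[:5])  # tensor([1.0000, 0.8825, 0.7788, 0.6873, 0.6065])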
tc5/train.py
ADDED
@@ -0,0 +1,323 @@
from accelerate.utils import set_seed

set_seed(1024)


import math
import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from datasets import concatenate_datasets
import matplotlib.pyplot as plt
import numpy as np
from .config import (
    BATCH_SIZE,
    DEVICE,
    EPOCHS,
    LR,
    GRAD_ACCUM_STEPS,
    HOP_LENGTH,
    SAMPLE_RATE,
)
from .model import TaikoConformer5
from .dataset import ds
from .preprocess import preprocess, collate_fn
from .loss import TaikoEnergyLoss
from huggingface_hub import upload_folder


# --- Helper function to log energy plots ---
def log_energy_plots_to_tensorboard(
    writer,
    tag_prefix,
    epoch,
    pred_don,
    pred_ka,
    pred_drumroll,
    true_don,
    true_ka,
    true_drumroll,
    valid_length,  # Actual valid length of the sequence (before padding)
    hop_sec,
):
    """
    Logs a plot of predicted vs. true energies for one sample to TensorBoard.
    Energies should be 1D numpy arrays for the single sample, up to valid_length.
    """
    # Ensure data is on CPU and converted to numpy, and select only the valid part
    pred_don = pred_don[:valid_length].detach().cpu().numpy()
    pred_ka = pred_ka[:valid_length].detach().cpu().numpy()
    pred_drumroll = pred_drumroll[:valid_length].detach().cpu().numpy()
    true_don = true_don[:valid_length].cpu().numpy()
    true_ka = true_ka[:valid_length].cpu().numpy()
    true_drumroll = true_drumroll[:valid_length].cpu().numpy()

    time_axis = np.arange(valid_length) * hop_sec

    fig, axs = plt.subplots(3, 1, figsize=(15, 10), sharex=True)
    fig.suptitle(f"{tag_prefix} - Epoch {epoch}", fontsize=16)

    axs[0].plot(time_axis, true_don, label="True Don", color="blue", linestyle="--")
    axs[0].plot(time_axis, pred_don, label="Pred Don", color="lightblue", alpha=0.8)
    axs[0].set_ylabel("Don Energy")
    axs[0].legend()
    axs[0].grid(True)

    axs[1].plot(time_axis, true_ka, label="True Ka", color="red", linestyle="--")
    axs[1].plot(time_axis, pred_ka, label="Pred Ka", color="lightcoral", alpha=0.8)
    axs[1].set_ylabel("Ka Energy")
    axs[1].legend()
    axs[1].grid(True)

    axs[2].plot(
        time_axis, true_drumroll, label="True Drumroll", color="green", linestyle="--"
    )
    axs[2].plot(
        time_axis, pred_drumroll, label="Pred Drumroll", color="lightgreen", alpha=0.8
    )
    axs[2].set_ylabel("Drumroll Energy")
    axs[2].set_xlabel("Time (s)")
    axs[2].legend()
    axs[2].grid(True)

    plt.tight_layout(rect=[0, 0, 1, 0.96])  # Adjust layout to make space for suptitle
    writer.add_figure(f"{tag_prefix}/Energy_Comparison", fig, epoch)
    plt.close(fig)


def main():
    global ds

    # Calculate hop seconds for model output frames
    # This assumes the model output time dimension corresponds to the mel spectrogram time dimension
    output_frame_hop_sec = HOP_LENGTH / SAMPLE_RATE

    best_val_loss = float("inf")
    patience = 10  # Increased patience a bit
    pat_count = 0

    ds_oni = ds.map(
        preprocess,
        remove_columns=ds.column_names,
        fn_kwargs={"difficulty": "oni"},
        writer_batch_size=10,
    )
    ds_hard = ds.map(
        preprocess,
        remove_columns=ds.column_names,
        fn_kwargs={"difficulty": "hard"},
        writer_batch_size=10,
    )
    ds_normal = ds.map(
        preprocess,
        remove_columns=ds.column_names,
        fn_kwargs={"difficulty": "normal"},
        writer_batch_size=10,
    )
    ds = concatenate_datasets([ds_oni, ds_hard, ds_normal])

    ds_train_test = ds.train_test_split(test_size=0.1, seed=42)
    train_loader = DataLoader(
        ds_train_test["train"],
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=2,
    )
    val_loader = DataLoader(
        ds_train_test["test"],
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=2,
    )

    model = TaikoConformer5().to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

    criterion = TaikoEnergyLoss(reduction="mean").to(DEVICE)

    # Adjust scheduler steps for gradient accumulation
    num_optimizer_steps_per_epoch = math.ceil(len(train_loader) / GRAD_ACCUM_STEPS)
    total_optimizer_steps = EPOCHS * num_optimizer_steps_per_epoch

    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=LR, total_steps=total_optimizer_steps
    )

    writer = SummaryWriter()

    for epoch in range(1, EPOCHS + 1):
        model.train()
        total_epoch_loss = 0.0
        optimizer.zero_grad()

        for idx, batch in enumerate(tqdm(train_loader, desc=f"Train Epoch {epoch}")):
            mel = batch["mel"].to(DEVICE)
| 158 |
+
# Unpack new energy-based labels
|
| 159 |
+
don_labels = batch["don_labels"].to(DEVICE)
|
| 160 |
+
ka_labels = batch["ka_labels"].to(DEVICE)
|
| 161 |
+
drumroll_labels = batch["drumroll_labels"].to(DEVICE)
|
| 162 |
+
lengths = batch["lengths"].to(
|
| 163 |
+
DEVICE
|
| 164 |
+
) # These are for the Conformer model output
|
| 165 |
+
nps = batch["nps"].to(DEVICE)
|
| 166 |
+
|
| 167 |
+
output_dict = model(mel, lengths, nps)
|
| 168 |
+
# output_dict["presence"] is now (B, T_out, 3) for don, ka, drumroll energies
|
| 169 |
+
pred_energies_batch = output_dict["presence"] # (B, T_out, 3)
|
| 170 |
+
|
| 171 |
+
loss_input_batch = {
|
| 172 |
+
"don_labels": don_labels,
|
| 173 |
+
"ka_labels": ka_labels,
|
| 174 |
+
"drumroll_labels": drumroll_labels,
|
| 175 |
+
"lengths": lengths, # Pass lengths for masking within the loss function
|
| 176 |
+
}
|
| 177 |
+
loss = criterion(output_dict, loss_input_batch)
|
| 178 |
+
|
| 179 |
+
(loss / GRAD_ACCUM_STEPS).backward()
|
| 180 |
+
|
| 181 |
+
if (idx + 1) % GRAD_ACCUM_STEPS == 0 or (idx + 1) == len(train_loader):
|
| 182 |
+
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
|
| 183 |
+
optimizer.step()
|
| 184 |
+
scheduler.step()
|
| 185 |
+
optimizer.zero_grad()
|
| 186 |
+
|
| 187 |
+
total_epoch_loss += loss.item()
|
| 188 |
+
|
| 189 |
+
# Log plot for the first sample of the first batch in each training epoch
|
| 190 |
+
if idx == 0:
|
| 191 |
+
first_sample_pred_don = pred_energies_batch[0, :, 0]
|
| 192 |
+
first_sample_pred_ka = pred_energies_batch[0, :, 1]
|
| 193 |
+
first_sample_pred_drumroll = pred_energies_batch[0, :, 2]
|
| 194 |
+
|
| 195 |
+
first_sample_true_don = don_labels[0, :]
|
| 196 |
+
first_sample_true_ka = ka_labels[0, :]
|
| 197 |
+
first_sample_true_drumroll = drumroll_labels[0, :]
|
| 198 |
+
|
| 199 |
+
first_sample_length = lengths[
|
| 200 |
+
0
|
| 201 |
+
].item() # Get the valid length of the first sample
|
| 202 |
+
|
| 203 |
+
log_energy_plots_to_tensorboard(
|
| 204 |
+
writer,
|
| 205 |
+
"Train/Sample_0",
|
| 206 |
+
epoch,
|
| 207 |
+
first_sample_pred_don,
|
| 208 |
+
first_sample_pred_ka,
|
| 209 |
+
first_sample_pred_drumroll,
|
| 210 |
+
first_sample_true_don,
|
| 211 |
+
first_sample_true_ka,
|
| 212 |
+
first_sample_true_drumroll,
|
| 213 |
+
first_sample_length,
|
| 214 |
+
output_frame_hop_sec,
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
avg_train_loss = total_epoch_loss / len(train_loader)
|
| 218 |
+
writer.add_scalar("Loss/Train_Avg", avg_train_loss, epoch)
|
| 219 |
+
|
| 220 |
+
# Validation
|
| 221 |
+
model.eval()
|
| 222 |
+
total_val_loss = 0.0
|
| 223 |
+
# Removed storage for classification logits/labels and confusion matrix components
|
| 224 |
+
|
| 225 |
+
with torch.no_grad():
|
| 226 |
+
for val_idx, batch in enumerate(
|
| 227 |
+
tqdm(val_loader, desc=f"Val Epoch {epoch}")
|
| 228 |
+
):
|
| 229 |
+
mel = batch["mel"].to(DEVICE)
|
| 230 |
+
don_labels = batch["don_labels"].to(DEVICE)
|
| 231 |
+
ka_labels = batch["ka_labels"].to(DEVICE)
|
| 232 |
+
drumroll_labels = batch["drumroll_labels"].to(DEVICE)
|
| 233 |
+
lengths = batch["lengths"].to(DEVICE)
|
| 234 |
+
nps = batch["nps"].to(DEVICE) # Ground truth NPS from batch
|
| 235 |
+
|
| 236 |
+
output_dict = model(mel, lengths, nps)
|
| 237 |
+
pred_energies_val_batch = output_dict["presence"] # (B, T_out, 3)
|
| 238 |
+
|
| 239 |
+
val_loss_input_batch = {
|
| 240 |
+
"don_labels": don_labels,
|
| 241 |
+
"ka_labels": ka_labels,
|
| 242 |
+
"drumroll_labels": drumroll_labels,
|
| 243 |
+
"lengths": lengths,
|
| 244 |
+
}
|
| 245 |
+
val_loss = criterion(output_dict, val_loss_input_batch)
|
| 246 |
+
total_val_loss += val_loss.item()
|
| 247 |
+
|
| 248 |
+
# Log plot for the first sample of the first batch in each validation epoch
|
| 249 |
+
if val_idx == 0:
|
| 250 |
+
first_val_sample_pred_don = pred_energies_val_batch[0, :, 0]
|
| 251 |
+
first_val_sample_pred_ka = pred_energies_val_batch[0, :, 1]
|
| 252 |
+
first_val_sample_pred_drumroll = pred_energies_val_batch[0, :, 2]
|
| 253 |
+
|
| 254 |
+
first_val_sample_true_don = don_labels[0, :]
|
| 255 |
+
first_val_sample_true_ka = ka_labels[0, :]
|
| 256 |
+
first_val_sample_true_drumroll = drumroll_labels[0, :]
|
| 257 |
+
|
| 258 |
+
first_val_sample_length = lengths[0].item()
|
| 259 |
+
|
| 260 |
+
log_energy_plots_to_tensorboard(
|
| 261 |
+
writer,
|
| 262 |
+
"Eval/Sample_0",
|
| 263 |
+
epoch,
|
| 264 |
+
first_val_sample_pred_don,
|
| 265 |
+
first_val_sample_pred_ka,
|
| 266 |
+
first_val_sample_pred_drumroll,
|
| 267 |
+
first_val_sample_true_don,
|
| 268 |
+
first_val_sample_true_ka,
|
| 269 |
+
first_val_sample_true_drumroll,
|
| 270 |
+
first_val_sample_length,
|
| 271 |
+
output_frame_hop_sec,
|
| 272 |
+
)
|
| 273 |
+
|
| 274 |
+
# Log ground truth NPS for reference during validation if needed
|
| 275 |
+
# writer.add_scalar("NPS/GT_Val_Batch_Avg", nps.mean().item(), epoch * len(val_loader) + idx)
|
| 276 |
+
|
| 277 |
+
avg_val_loss = total_val_loss / len(val_loader)
|
| 278 |
+
writer.add_scalar("Loss/Val_Avg", avg_val_loss, epoch)
|
| 279 |
+
|
| 280 |
+
# Log learning rate
|
| 281 |
+
current_lr = optimizer.param_groups[0]["lr"]
|
| 282 |
+
writer.add_scalar("LR/learning_rate", current_lr, epoch)
|
| 283 |
+
|
| 284 |
+
# Log ground truth NPS from the last validation batch (or mean over epoch)
|
| 285 |
+
if "nps" in batch: # Check if nps is in the last batch
|
| 286 |
+
writer.add_scalar(
|
| 287 |
+
"NPS/GT_Val_LastBatch_Avg", batch["nps"].mean().item(), epoch
|
| 288 |
+
)
|
| 289 |
+
|
| 290 |
+
print(
|
| 291 |
+
f"Epoch {epoch:02d} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | LR: {current_lr:.2e}"
|
| 292 |
+
)
|
| 293 |
+
|
| 294 |
+
if avg_val_loss < best_val_loss:
|
| 295 |
+
best_val_loss = avg_val_loss
|
| 296 |
+
pat_count = 0
|
| 297 |
+
torch.save(model.state_dict(), "best_model.pt") # Changed model save name
|
| 298 |
+
print(f"Saved new best model to best_model.pt at epoch {epoch}")
|
| 299 |
+
else:
|
| 300 |
+
pat_count += 1
|
| 301 |
+
if pat_count >= patience:
|
| 302 |
+
print("Early stopping!")
|
| 303 |
+
break
|
| 304 |
+
writer.close()
|
| 305 |
+
|
| 306 |
+
model_id = "JacobLinCool/taiko-conformer-5"
|
| 307 |
+
try:
|
| 308 |
+
model.push_to_hub(model_id, commit_message="Upload trained model")
|
| 309 |
+
upload_folder(
|
| 310 |
+
repo_id=model_id,
|
| 311 |
+
folder_path="runs",
|
| 312 |
+
path_in_repo=".",
|
| 313 |
+
commit_message="Upload training logs",
|
| 314 |
+
ignore_patterns=["*.txt", "*.json", "*.csv"],
|
| 315 |
+
)
|
| 316 |
+
print(f"Model and logs uploaded to {model_id}")
|
| 317 |
+
except Exception as e:
|
| 318 |
+
print(f"Error uploading to Hugging Face Hub: {e}")
|
| 319 |
+
print("Make sure you have the correct permissions and try again.")
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
if __name__ == "__main__":
|
| 323 |
+
main()
|
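Note on the scheduler sizing above: OneCycleLR must be stepped once per optimizer step, not once per batch, which is why total_steps divides the loader length by GRAD_ACCUM_STEPS. A quick sanity check, a minimal sketch with a made-up loader size (900 batches is hypothetical, for illustration only):

import math

batches_per_epoch = 900  # hypothetical
GRAD_ACCUM_STEPS = 8
EPOCHS = 200

# One optimizer step per GRAD_ACCUM_STEPS batches, plus a final partial step.
steps_per_epoch = math.ceil(batches_per_epoch / GRAD_ACCUM_STEPS)  # 113
total_steps = EPOCHS * steps_per_epoch  # 22600, the value handed to OneCycleLR
print(steps_per_epoch, total_steps)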
tc6/__init__.py
ADDED

File without changes

tc6/config.py
ADDED

@@ -0,0 +1,25 @@
import torch

# ─── 1) CONFIG ─────────────────────────────────────────────────────
SAMPLE_RATE = 22050
N_MELS = 80
HOP_LENGTH = 256
TIME_SUB = 1
CNN_CH = 256
N_HEADS = 8
D_MODEL = 512
FF_DIM = 1024
N_LAYERS = 6
DEPTHWISE_CONV_KERNEL_SIZE = 31
DROPOUT = 0.1
HIDDEN_DIM = 64
N_TYPES = 7
BATCH_SIZE = 2
GRAD_ACCUM_STEPS = 8
LR = 3e-4
EPOCHS = 200
DEVICE = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
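With these values, each mel frame (and, since TIME_SUB = 1, each label and model-output frame) covers about 11.6 ms, roughly 86 frames per second. A quick derivation from the constants above:

SAMPLE_RATE = 22050
HOP_LENGTH = 256
TIME_SUB = 1

hop_sec = HOP_LENGTH / SAMPLE_RATE   # ~0.01161 s per mel frame
fps = SAMPLE_RATE / HOP_LENGTH       # ~86.13 frames per second
label_hop_sec = hop_sec * TIME_SUB   # labels share the frame rate when TIME_SUB == 1
print(f"{hop_sec * 1000:.2f} ms/frame, {fps:.2f} fps")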
tc6/dataset.py
ADDED

@@ -0,0 +1,21 @@
from datasets import load_dataset, concatenate_datasets

ds1 = load_dataset("JacobLinCool/taiko-2023-1.1", split="train")
ds2 = load_dataset("JacobLinCool/taiko-2023-1.2", split="train")
ds3 = load_dataset("JacobLinCool/taiko-2023-1.3", split="train")
ds4 = load_dataset("JacobLinCool/taiko-2023-1.4", split="train")
ds5 = load_dataset("JacobLinCool/taiko-2023-1.5", split="train")
ds6 = load_dataset("JacobLinCool/taiko-2023-1.6", split="train")
ds7 = load_dataset("JacobLinCool/taiko-2023-1.7", split="train")
ds = concatenate_datasets([ds1, ds2, ds3, ds4, ds5, ds6, ds7]).with_format("torch")

good = list(range(len(ds)))
good.remove(1079)  # 1079 has file problem
ds = ds.select(good)

# for local test
# ds = (
#     load_dataset("JacobLinCool/taiko-2023-1.6", split="train")
#     .with_format("torch")
#     .select(range(10))
# )
tc6/infer.py
ADDED

@@ -0,0 +1,354 @@
import time
import torch
import torchaudio
import matplotlib.pyplot as plt
import numpy as np
from .config import SAMPLE_RATE, N_MELS, HOP_LENGTH
import torch.profiler


# --- PREPROCESSING (match training) ---
def preprocess_audio(audio_path):
    wav, sr = torchaudio.load(audio_path)
    wav = wav.mean(dim=0)  # mono
    if sr != SAMPLE_RATE:
        wav = torchaudio.functional.resample(wav, sr, SAMPLE_RATE)
    wav = wav / (wav.abs().max() + 1e-8)  # Normalize audio

    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_mels=N_MELS,
        hop_length=HOP_LENGTH,
        n_fft=2048,
    )
    mel = mel_transform(wav)
    return mel  # mel is (N_MELS, T_mel)


# --- INFERENCE ---
def run_inference(model, mel_input, nps_input, difficulty_input, level_input, device):
    model.eval()
    with torch.no_grad():
        mel = mel_input.to(device).unsqueeze(0)  # (1, N_MELS, T_mel)
        nps = nps_input.to(device).unsqueeze(0)  # (1,)
        difficulty = difficulty_input.to(device).unsqueeze(0)  # (1,)
        level = level_input.to(device).unsqueeze(0)  # (1,)

        mel_cnn_input = mel.unsqueeze(1)  # (1, 1, N_MELS, T_mel)

        conformer_lengths = torch.tensor(
            [mel_cnn_input.shape[-1]], dtype=torch.long, device=device
        )

        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                *(
                    [torch.profiler.ProfilerActivity.CUDA]
                    if device.type == "cuda"
                    else []
                ),
            ],
            record_shapes=True,
            profile_memory=True,
            with_stack=False,
            with_flops=True,
        ) as prof:
            out_dict = model(mel_cnn_input, conformer_lengths, nps, difficulty, level)
        print(
            prof.key_averages().table(
                sort_by=(
                    "self_cuda_memory_usage"
                    if device.type == "cuda"
                    else "self_cpu_time_total"
                ),
                row_limit=20,
            )
        )

        energies = out_dict["presence"].squeeze(0).cpu().numpy()

        don_energy = energies[:, 0]
        ka_energy = energies[:, 1]
        drumroll_energy = energies[:, 2]

        return don_energy, ka_energy, drumroll_energy


# --- DECODE TO ONSETS ---
def decode_onsets(
    don_energy,
    ka_energy,
    drumroll_energy,
    hop_sec,
    threshold=0.5,
    min_distance_frames=3,
):
    results = []
    T_out = len(don_energy)
    last_onset_frame = -min_distance_frames

    for i in range(1, T_out - 1):  # Iterate considering neighbors for peak detection
        if i < last_onset_frame + min_distance_frames:
            continue

        e_don, e_ka, e_drum = don_energy[i], ka_energy[i], drumroll_energy[i]
        energies_at_i = {
            1: e_don,
            2: e_ka,
            5: e_drum,
        }  # Type mapping: 1:Don, 2:Ka, 5:Drumroll

        # Check the channels in descending energy order, so the strongest channel
        # that forms a local peak above the threshold wins this frame.
        sorted_types_by_energy = sorted(
            energies_at_i.keys(), key=lambda x: energies_at_i[x], reverse=True
        )

        for onset_type in sorted_types_by_energy:
            if onset_type == 1:
                current_energy_series = don_energy
            elif onset_type == 2:
                current_energy_series = ka_energy
            else:
                current_energy_series = drumroll_energy

            energy_val = current_energy_series[i]

            # An onset fires where the energy is a local peak above the threshold.
            if (
                energy_val > threshold
                and energy_val > current_energy_series[i - 1]
                and energy_val > current_energy_series[i + 1]
            ):
                results.append((i * hop_sec, onset_type))
                last_onset_frame = i
                break  # Only one onset type per frame

    return results


# --- VISUALIZATION ---
def plot_results(
    mel_spectrogram,
    don_energy,
    ka_energy,
    drumroll_energy,
    onsets,
    hop_sec,
    out_path=None,
):
    # mel_spectrogram is (N_MELS, T_mel)
    T_mel = mel_spectrogram.shape[1]
    T_out = len(don_energy)  # Length of energy arrays (model output time dimension)

    # Time axes. The CNN frontend pools only in frequency, so the model emits one
    # output frame per input mel frame (and with TIME_SUB == 1 the labels share
    # that rate); hop_sec is the seconds-per-frame that decode_onsets used, so
    # both axes below line up.
    time_axis_mel = np.arange(T_mel) * (HOP_LENGTH / SAMPLE_RATE)
    time_axis_energies = np.arange(T_out) * hop_sec

    fig, ax1 = plt.subplots(figsize=(100, 10))

    # Plot Mel Spectrogram on ax1
    mel_db = torchaudio.functional.amplitude_to_DB(
        mel_spectrogram, multiplier=10.0, amin=1e-10, db_multiplier=0.0
    )
    img = ax1.imshow(
        mel_db.numpy(),
        aspect="auto",
        origin="lower",
        cmap="magma",
        extent=[time_axis_mel[0], time_axis_mel[-1], 0, N_MELS],
    )
    ax1.set_title("Mel Spectrogram with Predicted Energies and Onsets")
    ax1.set_xlabel("Time (s)")
    ax1.set_ylabel("Mel Bin")
    fig.colorbar(img, ax=ax1, format="%+2.0f dB")

    # Create a second y-axis for energies
    ax2 = ax1.twinx()
    ax2.plot(time_axis_energies, don_energy, label="Don Energy", color="red")
    ax2.plot(time_axis_energies, ka_energy, label="Ka Energy", color="blue")
    ax2.plot(
        time_axis_energies, drumroll_energy, label="Drumroll Energy", color="green"
    )
    ax2.set_ylabel("Energy")
    ax2.set_ylim(0, 1.2)  # Assuming energies are somewhat normalized or bounded

    # Overlay onsets from decode_onsets (t is already in seconds)
    labeled_types = set()
    # Group drumrolls into segments (reuse logic from write_tja)
    drumroll_times = [t_sec for t_sec, typ in onsets if typ == 5]
    drumroll_times.sort()
    drumroll_segments = []
    if drumroll_times:
        seg_start = drumroll_times[0]
        prev = drumroll_times[0]
        for t in drumroll_times[1:]:
            if t - prev <= hop_sec * 6:  # up to 5-frame gap
                prev = t
            else:
                drumroll_segments.append((seg_start, prev))
                seg_start = t
                prev = t
        drumroll_segments.append((seg_start, prev))
    # Plot Don/Ka onsets as vertical lines
    for t_sec, typ in onsets:
        if typ == 5:
            continue  # skip drumroll onsets
        color_map = {1: "darkred", 2: "darkblue"}
        label_map = {1: "Don Onset", 2: "Ka Onset"}
        line_color = color_map.get(typ, "black")
        line_label = label_map.get(typ, f"Type {typ} Onset")
        if typ not in labeled_types:
            ax1.axvline(
                t_sec, color=line_color, linestyle="--", alpha=0.9, label=line_label
            )
            labeled_types.add(typ)
        else:
            ax1.axvline(t_sec, color=line_color, linestyle="--", alpha=0.9)
    # Plot drumroll segments as shaded regions
    for seg_start, seg_end in drumroll_segments:
        ax1.axvspan(
            seg_start,
            seg_end + hop_sec,
            color="green",
            alpha=0.2,
            label="Drumroll Segment" if "drumroll" not in labeled_types else None,
        )
        labeled_types.add("drumroll")

    # Combine legends from both axes
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc="upper right")

    fig.tight_layout()

    # Save to file if a path is provided, otherwise return the figure
    if out_path:
        plt.savefig(out_path)
        print(f"Saved plot to {out_path}")
        plt.close(fig)
        return out_path
    else:
        return fig


def write_tja(onsets, out_path=None, bpm=160, quantize=96, audio="audio.wav"):
    # TJA types: 0:no note, 1:Don, 2:Ka, 3:BigDon, 4:BigKa, 5:DrumrollStart, 8:DrumrollEnd
    # Model output types: 1:Don, 2:Ka, 5:Drumroll (interpreted as start/single)
    sec_per_beat = 60 / bpm
    beats_per_measure = 4  # Assuming 4/4 time signature
    sec_per_measure = sec_per_beat * beats_per_measure
    # Step 1: Map onsets to (measure_idx, slot, typ)
    slot_events = []
    for t, typ in onsets:
        measure_idx = int(t // sec_per_measure)
        t_in_measure = t % sec_per_measure
        slot = int(round(t_in_measure / sec_per_measure * quantize))
        if slot >= quantize:
            slot = quantize - 1
        slot_events.append((measure_idx, slot, typ))
    # Step 2: Build measure/slot grid
    if slot_events:
        max_measure_idx = max(m for m, _, _ in slot_events)
    else:
        max_measure_idx = -1
    measures = {i: [0] * quantize for i in range(max_measure_idx + 1)}
    # Step 3: Place Don/Ka, collect drumrolls
    drumroll_slots = set()
    for m, s, typ in slot_events:
        if typ in [1, 2]:
            measures[m][s] = typ
        elif typ == 5:
            drumroll_slots.add((m, s))
    # Step 4: Process drumrolls into contiguous regions, mark 5 (start) and 8 (end)
    drumroll_list = sorted(drumroll_slots)
    # Group into contiguous regions (allowing a gap of 5 slots)
    grouped = []
    group = []
    for ms in drumroll_list:
        if not group:
            group = [ms]
        else:
            last_m, last_s = group[-1]
            m, s = ms
            # Calculate slot distance, considering measure wrap
            if m == last_m:
                slot_dist = s - last_s
            elif m == last_m + 1 and last_s <= quantize - 1:
                slot_dist = (quantize - 1 - last_s) + s + 1
            else:
                slot_dist = None
            # Allow gap of up to 5 slots (slot_dist <= 6)
            if slot_dist is not None and 1 <= slot_dist <= 6:
                group.append(ms)
            else:
                grouped.append(group)
                group = [ms]
    if group:
        grouped.append(group)
    # Mark 5 (start) and 8 (end) for each group
    for region in grouped:
        if len(region) == 1:
            m, s = region[0]
            measures[m][s] = 5
            # Place 8 in next slot (or next measure if at end)
            if s < quantize - 1:
                measures[m][s + 1] = 8
            elif m < max_measure_idx:
                measures[m + 1][0] = 8
        else:
            m_start, s_start = region[0]
            m_end, s_end = region[-1]
            measures[m_start][s_start] = 5
            measures[m_end][s_end] = 8
            # Middle slots stay 0 (already 0 by default)
    # Step 5: Generate TJA content
    tja_content = []
    tja_content.append(f"TITLE:{audio} (TC6, {time.strftime('%Y-%m-%d %H:%M:%S')})")
    tja_content.append(f"BPM:{bpm}")
    tja_content.append(f"WAVE:{audio}")
    tja_content.append("OFFSET:0")
    tja_content.append("COURSE:Oni\nLEVEL:9\n")
    tja_content.append("#START")
    for i in range(max_measure_idx + 1):
        notes = measures.get(i, [0] * quantize)
        line = "".join(str(n) for n in notes)
        tja_content.append(line + ",")
    tja_content.append("#END")

    tja_string = "\n".join(tja_content)

    # If out_path is provided, also write to file
    if out_path:
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(tja_string)
        print(f"TJA chart saved to {out_path}")

    return tja_string
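For reference, the pieces above chain together roughly as follows. This is a minimal sketch, not part of the commit: the checkpoint repo id, file names, and the NPS/difficulty/level values are made up for illustration (from_pretrained itself is provided by the PyTorchModelHubMixin the model inherits).

import torch
from tc6.config import SAMPLE_RATE, HOP_LENGTH
from tc6.model import TaikoConformer6
from tc6.infer import preprocess_audio, run_inference, decode_onsets, write_tja

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hypothetical checkpoint repo id, for illustration only.
model = TaikoConformer6.from_pretrained("JacobLinCool/taiko-conformer-6").to(device)

mel = preprocess_audio("song.wav")  # (N_MELS, T_mel)
don, ka, drum = run_inference(
    model,
    mel,
    torch.tensor(6.0),  # target notes per second (illustrative)
    torch.tensor(3.0),  # difficulty id, e.g. oni
    torch.tensor(9.0),  # level
    device,
)
onsets = decode_onsets(don, ka, drum, hop_sec=HOP_LENGTH / SAMPLE_RATE)
write_tja(onsets, out_path="song.tja", audio="song.wav")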
tc6/loss.py
ADDED

@@ -0,0 +1,65 @@
import torch
import torch.nn as nn


class TaikoEnergyLoss(nn.Module):
    def __init__(self, reduction="mean"):
        super().__init__()
        # Use 'none' reduction to get element-wise losses, then manually apply masking and reduction
        self.mse_loss = nn.MSELoss(reduction="none")
        self.reduction = reduction

    def forward(self, outputs, batch):
        """
        Calculates the MSE loss for energy-based predictions.

        Args:
            outputs (dict): Model output, containing 'presence' tensor.
                            outputs['presence'] shape: (B, T, 3) for don, ka, drumroll energies.
            batch (dict): Batch data from collate_fn, containing true labels and lengths.
                          batch['don_labels'], batch['ka_labels'], batch['drumroll_labels'] shape: (B, T)
                          batch['lengths'] shape: (B,) - valid sequence lengths for time dimension T.
        Returns:
            torch.Tensor: The calculated loss.
        """
        pred_energies = outputs["presence"]  # (B, T, 3)

        true_don = batch["don_labels"]  # (B, T)
        true_ka = batch["ka_labels"]  # (B, T)
        true_drumroll = batch["drumroll_labels"]  # (B, T)

        # Stack true labels to match the structure of pred_energies (B, T, 3)
        true_energies = torch.stack([true_don, true_ka, true_drumroll], dim=2)

        B, T, _ = pred_energies.shape

        # Create a mask based on batch['lengths'] to ignore padded parts of sequences.
        # batch['lengths'] gives the actual length of each sequence in the batch.
        # mask shape: (B, T)
        mask_2d = torch.arange(T, device=pred_energies.device).expand(B, T) < batch[
            "lengths"
        ].unsqueeze(1)
        # Expand mask to (B, T, 1) to broadcast across the 3 energy channels
        mask_3d = mask_2d.unsqueeze(2)

        # Calculate element-wise MSE loss
        loss_elementwise = self.mse_loss(pred_energies, true_energies)  # (B, T, 3)

        # Apply the mask to the loss
        masked_loss = loss_elementwise * mask_3d

        if self.reduction == "mean":
            # Sum the loss over all valid (unmasked) elements and divide by the number of valid elements
            total_loss = masked_loss.sum()
            num_valid_elements = mask_3d.sum()  # Total number of unmasked float values
            if num_valid_elements > 0:
                return total_loss / num_valid_elements
            else:
                # Avoid division by zero if there are no valid elements (e.g., empty batch or all lengths are 0)
                return torch.tensor(
                    0.0, device=pred_energies.device, requires_grad=True
                )
        elif self.reduction == "sum":
            return masked_loss.sum()
        else:  # 'none' or any other case
            return masked_loss
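A quick self-contained check of the masking behavior, a minimal sketch with random tensors (shapes only, the values are meaningless): frames at or beyond each sample's length contribute nothing to the mean, so corrupting the padded region must leave the loss unchanged.

import torch
from tc6.loss import TaikoEnergyLoss

B, T = 2, 10
outputs = {"presence": torch.rand(B, T, 3)}
batch = {
    "don_labels": torch.rand(B, T),
    "ka_labels": torch.rand(B, T),
    "drumroll_labels": torch.rand(B, T),
    "lengths": torch.tensor([10, 4]),  # second sample: frames 4..9 are padding
}
criterion = TaikoEnergyLoss(reduction="mean")
loss = criterion(outputs, batch)

# Corrupt only the masked-out padding; the loss must not move.
outputs["presence"][1, 4:] = 123.0
assert torch.isclose(loss, criterion(outputs, batch))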
tc6/model.py
ADDED

@@ -0,0 +1,166 @@
import torch
import torch.nn as nn
from torchaudio.models import Conformer
from huggingface_hub import PyTorchModelHubMixin
from .config import (
    N_MELS,
    CNN_CH,
    N_HEADS,
    D_MODEL,
    FF_DIM,
    N_LAYERS,
    DROPOUT,
    DEPTHWISE_CONV_KERNEL_SIZE,
    HIDDEN_DIM,
    DEVICE,
)


class TaikoConformer6(nn.Module, PyTorchModelHubMixin):
    def __init__(self):
        super().__init__()
        # 1) CNN frontend: frequency-only pooling
        self.cnn = nn.Sequential(
            nn.Conv2d(1, CNN_CH, 3, stride=(2, 1), padding=1),
            nn.BatchNorm2d(CNN_CH),
            nn.GELU(),
            nn.Dropout2d(DROPOUT),
            nn.Conv2d(CNN_CH, CNN_CH, 3, stride=(2, 1), padding=1),
            nn.BatchNorm2d(CNN_CH),
            nn.GELU(),
            nn.Dropout2d(DROPOUT),
        )
        feat_dim = CNN_CH * (N_MELS // 4)

        # 2) Linear projection to model dimension
        self.proj = nn.Linear(feat_dim, D_MODEL)

        # 3) FiLM conditioning for notes_per_second, difficulty, and level
        self.film_nps = nn.Linear(1, 2 * D_MODEL)
        self.film_difficulty = nn.Linear(
            1, 2 * D_MODEL
        )  # Assuming difficulty is a single scalar
        self.film_level = nn.Linear(1, 2 * D_MODEL)  # Assuming level is a single scalar

        # 4) Conformer encoder
        self.encoder = Conformer(
            input_dim=D_MODEL,
            num_heads=N_HEADS,
            ffn_dim=FF_DIM,
            num_layers=N_LAYERS,
            depthwise_conv_kernel_size=DEPTHWISE_CONV_KERNEL_SIZE,
            dropout=DROPOUT,
            use_group_norm=False,
            convolution_first=False,
        )

        # 5) Presence regressor head
        self.presence_regressor = nn.Sequential(
            nn.Dropout(DROPOUT),
            nn.Linear(D_MODEL, HIDDEN_DIM),
            nn.GELU(),
            nn.Dropout(DROPOUT),
            nn.Linear(HIDDEN_DIM, 3),  # Don, Ka, DrumRoll energy
            nn.Sigmoid(),  # Output between 0 and 1
        )

        # 6) Initialize weights
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
            elif isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(
        self,
        mel: torch.Tensor,
        lengths: torch.Tensor,
        notes_per_second: torch.Tensor,
        difficulty: torch.Tensor,
        level: torch.Tensor,
    ):
        """
        Args:
            mel: (B, 1, N_MELS, T_mel)
            lengths: (B,) lengths after CNN
            notes_per_second: (B,) stream of control values
            difficulty: (B,) difficulty values
            level: (B,) level values
        Returns:
            Dict with:
                'presence': (B, T_cnn_out, 3)
                'lengths': lengths
        """
        # CNN frontend
        x = self.cnn(mel)  # (B, C, F, T)
        B, C, F, T = x.size()
        x = x.permute(0, 3, 1, 2).reshape(B, T, C * F)

        # Project to model dimension
        x = self.proj(x)  # (B, T, D_MODEL)

        # FiLM conditioning
        nps = notes_per_second.unsqueeze(-1).float()  # (B, 1)
        gamma_beta_nps = self.film_nps(nps)  # (B, 2*D_MODEL)
        gamma_nps, beta_nps = gamma_beta_nps.chunk(2, dim=-1)
        x = gamma_nps.unsqueeze(1) * x + beta_nps.unsqueeze(1)

        diff = difficulty.unsqueeze(-1).float()  # (B, 1)
        gamma_beta_diff = self.film_difficulty(diff)  # (B, 2*D_MODEL)
        gamma_diff, beta_diff = gamma_beta_diff.chunk(2, dim=-1)
        x = gamma_diff.unsqueeze(1) * x + beta_diff.unsqueeze(1)

        lvl = level.unsqueeze(-1).float()  # (B, 1)
        gamma_beta_lvl = self.film_level(lvl)  # (B, 2*D_MODEL)
        gamma_lvl, beta_lvl = gamma_beta_lvl.chunk(2, dim=-1)
        x = gamma_lvl.unsqueeze(1) * x + beta_lvl.unsqueeze(1)

        # Conformer encoder
        x, _ = self.encoder(x, lengths=lengths)

        # Presence prediction
        presence = self.presence_regressor(x)
        return {"presence": presence, "lengths": lengths}


if __name__ == "__main__":
    model = TaikoConformer6().to(device=DEVICE)
    print(model)

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"{name}: {param.numel():,}")

    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {params / 1e6:.2f}M")

    batch_size = 4
    mel_time_steps = 1024
    input_mel = torch.randn(batch_size, 1, N_MELS, mel_time_steps).to(DEVICE)

    conformer_lengths = torch.tensor(
        [mel_time_steps] * batch_size, dtype=torch.long
    ).to(DEVICE)

    notes_per_second_input = torch.tensor([10.0] * batch_size, dtype=torch.float32).to(
        DEVICE
    )
    difficulty_input = torch.tensor([1.0] * batch_size, dtype=torch.float32).to(
        DEVICE
    )  # Example difficulty
    level_input = torch.tensor([5.0] * batch_size, dtype=torch.float32).to(
        DEVICE
    )  # Example level

    output = model(
        input_mel,
        conformer_lengths,
        notes_per_second_input,
        difficulty_input,
        level_input,
    )
    print("Output shapes:")
    for key, value in output.items():
        print(f"{key}: {value.shape}")
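The three FiLM layers above apply the standard feature-wise affine modulation gamma * x + beta, with gamma and beta predicted from a single scalar control; using one layer per control lets NPS, difficulty, and level each rescale and shift all D_MODEL channels independently. A minimal standalone sketch of the mechanism on toy shapes (D = 8 is arbitrary, for illustration):

import torch
import torch.nn as nn

D = 8  # toy model dimension
film = nn.Linear(1, 2 * D)  # predicts gamma and beta from one scalar

x = torch.randn(2, 5, D)          # (batch, time, channels)
ctrl = torch.tensor([4.0, 12.0])  # e.g. notes-per-second per sample
gamma, beta = film(ctrl.unsqueeze(-1)).chunk(2, dim=-1)  # each (2, D)
x = gamma.unsqueeze(1) * x + beta.unsqueeze(1)           # broadcast over time
print(x.shape)  # torch.Size([2, 5, 8])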
tc6/preprocess.py
ADDED

@@ -0,0 +1,258 @@
import math
import torch
import torch.nn as nn
import torchaudio
from torchaudio.transforms import FrequencyMasking
from tja import parse_tja, PyParsingMode
from .config import N_TYPES, SAMPLE_RATE, N_MELS, HOP_LENGTH, TIME_SUB
from .model import TaikoConformer6


mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_mels=N_MELS,
    hop_length=HOP_LENGTH,
    n_fft=2048,
)


freq_mask = FrequencyMasking(freq_mask_param=15)


def preprocess(example, difficulty="oni"):
    wav_tensor = example["audio"]["array"]
    sr = example["audio"]["sampling_rate"]
    # 1) load & resample
    if sr != SAMPLE_RATE:
        wav_tensor = torchaudio.functional.resample(wav_tensor, sr, SAMPLE_RATE)
    # normalize audio
    wav_tensor = wav_tensor / (wav_tensor.abs().max() + 1e-8)
    # add random Gaussian noise
    if torch.rand(1).item() < 0.5:
        wav_tensor = wav_tensor + 0.005 * torch.randn_like(wav_tensor)
    # 2) mel: (1, N_MELS, T)
    mel = mel_transform(wav_tensor).unsqueeze(0)
    # apply SpecAugment
    mel = freq_mask(mel)
    _, _, T = mel.shape
    # 3) build label sequence of length ceil(T / TIME_SUB)
    T_sub = math.ceil(T / TIME_SUB)

    # Initialize energy-based labels for Don, Ka, Drumroll
    don_labels = torch.zeros(T_sub, dtype=torch.float32)
    ka_labels = torch.zeros(T_sub, dtype=torch.float32)
    drumroll_labels = torch.zeros(T_sub, dtype=torch.float32)

    # Define exponential decay tail parameters
    tail_length = 40  # number of frames for decay tail
    decay_rate = 8.0  # decay rate parameter, adjust as needed
    tail_kernel = torch.exp(
        -torch.arange(0, tail_length, dtype=torch.float32) / decay_rate
    )

    fps = SAMPLE_RATE / HOP_LENGTH
    num_valid_notes = 0
    for onset in example[difficulty]:
        typ, t_start, t_end, *_ = onset

        # Assuming N_TYPES in config is appropriately set (e.g., 7 or more)
        if typ < 1 or typ > N_TYPES:  # Filter out invalid types
            continue

        num_valid_notes += 1

        exact_frame_start = t_start.item() * fps

        # Types 1 and 3 are Don, types 2 and 4 are Ka
        if typ == 1 or typ == 3 or typ == 2 or typ == 4:
            exact_hit_time_sub = exact_frame_start / TIME_SUB

            current_labels = don_labels if (typ == 1 or typ == 3) else ka_labels

            start_points_info = []
            rounded_hit_time_sub = round(exact_hit_time_sub)

            if (
                abs(exact_hit_time_sub - rounded_hit_time_sub) < 1e-6
            ):  # Tolerance for float precision
                idx_single = int(rounded_hit_time_sub)
                if 0 <= idx_single < T_sub:
                    start_points_info.append({"idx": idx_single, "weight": 1.0})
            else:
                idx_floor = math.floor(exact_hit_time_sub)
                idx_ceil = idx_floor + 1

                frac = exact_hit_time_sub - idx_floor
                weight_ceil = frac
                weight_floor = 1.0 - frac

                if weight_floor > 1e-6 and 0 <= idx_floor < T_sub:
                    start_points_info.append({"idx": idx_floor, "weight": weight_floor})
                if weight_ceil > 1e-6 and 0 <= idx_ceil < T_sub:
                    start_points_info.append({"idx": idx_ceil, "weight": weight_ceil})

            for point_info in start_points_info:
                start_idx = point_info["idx"]
                weight = point_info["weight"]
                for k_idx, kernel_val in enumerate(tail_kernel):
                    target_idx = start_idx + k_idx
                    if 0 <= target_idx < T_sub:
                        current_labels[target_idx] = max(
                            current_labels[target_idx].item(),
                            weight * kernel_val.item(),
                        )

        # Types 5, 6, 7 are Drumroll
        elif typ >= 5 and typ <= 7:
            exact_frame_end = t_end.item() * fps
            exact_start_time_sub = exact_frame_start / TIME_SUB
            exact_end_time_sub = exact_frame_end / TIME_SUB

            # Drumroll body
            body_loop_start_idx = math.floor(exact_start_time_sub)
            body_loop_end_idx = math.ceil(exact_end_time_sub)

            for dr_idx in range(body_loop_start_idx, body_loop_end_idx):
                if 0 <= dr_idx < T_sub:
                    drumroll_labels[dr_idx] = 1.0

            # Drumroll tail (starts from exact_end_time_sub)
            tail_start_points_info = []
            rounded_end_time_sub = round(exact_end_time_sub)
            if abs(exact_end_time_sub - rounded_end_time_sub) < 1e-6:
                idx_single_tail = int(rounded_end_time_sub)
                if 0 <= idx_single_tail < T_sub:
                    tail_start_points_info.append(
                        {"idx": idx_single_tail, "weight": 1.0}
                    )
            else:
                idx_floor_tail = math.floor(exact_end_time_sub)
                idx_ceil_tail = idx_floor_tail + 1

                frac_tail = exact_end_time_sub - idx_floor_tail
                weight_ceil_tail = frac_tail
                weight_floor_tail = 1.0 - frac_tail

                if weight_floor_tail > 1e-6 and 0 <= idx_floor_tail < T_sub:
                    tail_start_points_info.append(
                        {"idx": idx_floor_tail, "weight": weight_floor_tail}
                    )
                if weight_ceil_tail > 1e-6 and 0 <= idx_ceil_tail < T_sub:
                    tail_start_points_info.append(
                        {"idx": idx_ceil_tail, "weight": weight_ceil_tail}
                    )

            for point_info in tail_start_points_info:
                start_idx = point_info["idx"]
                weight = point_info["weight"]
                for k_idx, kernel_val in enumerate(tail_kernel):
                    target_idx = start_idx + k_idx
                    if 0 <= target_idx < T_sub:
                        drumroll_labels[target_idx] = max(
                            drumroll_labels[target_idx].item(),
                            weight * kernel_val.item(),
                        )

    duration_seconds = wav_tensor.shape[-1] / SAMPLE_RATE
    nps = num_valid_notes / duration_seconds if duration_seconds > 0 else 0.0

    parsed = parse_tja(example["tja"], mode=PyParsingMode.Full)
    chart = next(
        (chart for chart in parsed.charts if chart.course.lower() == difficulty), None
    )
    difficulty_id = (
        0
        if difficulty == "easy"
        else (
            1
            if difficulty == "normal"
            else 2 if difficulty == "hard" else 3 if difficulty == "oni" else 4
        )  # Assuming 4 for edit/ura
    )
    level = chart.level if chart else 0

    # --- CNN shape inference and label padding/truncation ---
    # Simulate the CNN to get the output time length (T_cnn)
    dummy_model = TaikoConformer6()
    with torch.no_grad():
        cnn_out = dummy_model.cnn(mel.unsqueeze(0))  # (1, C, F, T_cnn)
        _, _, _, T_cnn = cnn_out.shape

    # Pad or truncate labels to T_cnn
    def pad_or_truncate(label, out_len):
        if label.shape[0] < out_len:
            pad = torch.zeros(out_len - label.shape[0], dtype=label.dtype)
            return torch.cat([label, pad], dim=0)
        else:
            return label[:out_len]

    don_labels = pad_or_truncate(don_labels, T_cnn)
    ka_labels = pad_or_truncate(ka_labels, T_cnn)
    drumroll_labels = pad_or_truncate(drumroll_labels, T_cnn)

    # For conformer input lengths: based on original mel shape (before CNN)
    conformer_input_length = min(math.ceil(T / TIME_SUB), T_cnn)

    print(
        f"Processed {num_valid_notes} notes in {duration_seconds:.2f} seconds, NPS: {nps:.2f}, Difficulty: {difficulty_id}, Level: {level}"
    )

    return {
        "mel": mel,  # (1, N_MELS, T)
        "don_labels": don_labels,  # (T_cnn,)
        "ka_labels": ka_labels,  # (T_cnn,)
        "drumroll_labels": drumroll_labels,  # (T_cnn,)
        "nps": torch.tensor(nps, dtype=torch.float32),
        "difficulty": torch.tensor(difficulty_id, dtype=torch.long),
        "level": torch.tensor(level, dtype=torch.long),
        "duration_seconds": torch.tensor(duration_seconds, dtype=torch.float32),
        "length": torch.tensor(conformer_input_length, dtype=torch.long),  # for conformer
    }


def collate_fn(batch):
    mels_list = [b["mel"].squeeze(0).transpose(0, 1) for b in batch]  # (T, N_MELS)
    don_labels_list = [b["don_labels"] for b in batch]
    ka_labels_list = [b["ka_labels"] for b in batch]
    drumroll_labels_list = [b["drumroll_labels"] for b in batch]
    nps_list = [b["nps"] for b in batch]
    difficulty_list = [b["difficulty"] for b in batch]
    level_list = [b["level"] for b in batch]
    durations_list = [b["duration_seconds"] for b in batch]
    lengths_list = [b["length"] for b in batch]

    # Pad mels
    padded_mels = nn.utils.rnn.pad_sequence(
        mels_list, batch_first=True
    )  # (B, T_max, N_MELS)
    reshaped_mels = padded_mels.transpose(1, 2).unsqueeze(1)
    T_max = padded_mels.shape[1]

    # Pad labels to T_max
    def pad_label(label, out_len):
        if label.shape[0] < out_len:
            pad = torch.zeros(out_len - label.shape[0], dtype=label.dtype)
            return torch.cat([label, pad], dim=0)
        else:
            return label[:out_len]

    don_labels = torch.stack([pad_label(l, T_max) for l in don_labels_list])
    ka_labels = torch.stack([pad_label(l, T_max) for l in ka_labels_list])
    drumroll_labels = torch.stack([pad_label(l, T_max) for l in drumroll_labels_list])
    lengths = torch.tensor(
        [min(l.item(), T_max) for l in lengths_list], dtype=torch.long
    )

    return {
        "mel": reshaped_mels,
        "don_labels": don_labels,
        "ka_labels": ka_labels,
        "drumroll_labels": drumroll_labels,
        "lengths": lengths,  # for conformer
        "nps": torch.stack(nps_list),
        "difficulty": torch.stack(difficulty_list),
        "level": torch.stack(level_list),
        "durations": torch.stack(durations_list),
    }
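The soft labels above place an exponentially decaying tail after each hit, with sub-frame timing handled by splitting the onset weight linearly between the two neighboring frames and merging overlaps with max(). A small sketch of the values this produces, using the same tail_length and decay_rate as the file (the fractional frame 10.25 is an arbitrary example):

import torch

tail_length, decay_rate = 40, 8.0
tail_kernel = torch.exp(-torch.arange(0, tail_length, dtype=torch.float32) / decay_rate)
print(tail_kernel[:5])  # tensor([1.0000, 0.8825, 0.7788, 0.6873, 0.6065])

# A hit at fractional frame 10.25 splits into frame 10 (weight 0.75)
# and frame 11 (weight 0.25); each start writes a scaled copy of the
# kernel, and overlapping contributions keep the maximum.
labels = torch.zeros(60)
for start, w in [(10, 0.75), (11, 0.25)]:
    for k, v in enumerate(tail_kernel):
        labels[start + k] = max(labels[start + k].item(), w * v.item())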
tc6/train.py
ADDED
|
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from accelerate.utils import set_seed
|
| 2 |
+
|
| 3 |
+
set_seed(1024)
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
import math
|
| 7 |
+
import torch
|
| 8 |
+
from torch.utils.data import DataLoader
|
| 9 |
+
from torch.utils.tensorboard import SummaryWriter
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
from datasets import concatenate_datasets
|
| 12 |
+
import matplotlib.pyplot as plt
|
| 13 |
+
import numpy as np
|
| 14 |
+
from .config import (
|
| 15 |
+
BATCH_SIZE,
|
| 16 |
+
DEVICE,
|
| 17 |
+
EPOCHS,
|
| 18 |
+
LR,
|
| 19 |
+
GRAD_ACCUM_STEPS,
|
| 20 |
+
HOP_LENGTH,
|
| 21 |
+
SAMPLE_RATE,
|
| 22 |
+
)
|
| 23 |
+
from .model import TaikoConformer6
|
| 24 |
+
from .dataset import ds
|
| 25 |
+
from .preprocess import preprocess, collate_fn
|
| 26 |
+
from .loss import TaikoEnergyLoss
|
| 27 |
+
from huggingface_hub import upload_folder
|
| 28 |
+
|
| 29 |
+
|
# --- Helper function to log energy plots ---
def log_energy_plots_to_tensorboard(
    writer,
    tag_prefix,
    epoch,
    pred_don,
    pred_ka,
    pred_drumroll,
    true_don,
    true_ka,
    true_drumroll,
    valid_length,  # Actual valid length of the sequence (before padding)
    hop_sec,
):
    """
    Logs a plot of predicted vs. true energies for one sample to TensorBoard.
    Inputs are 1D tensors for a single sample; only the first valid_length
    frames are plotted.
    """
    # Ensure data is on CPU and converted to numpy, and select only the valid part
    pred_don = pred_don[:valid_length].detach().cpu().numpy()
    pred_ka = pred_ka[:valid_length].detach().cpu().numpy()
    pred_drumroll = pred_drumroll[:valid_length].detach().cpu().numpy()
    true_don = true_don[:valid_length].cpu().numpy()
    true_ka = true_ka[:valid_length].cpu().numpy()
    true_drumroll = true_drumroll[:valid_length].cpu().numpy()

    time_axis = np.arange(valid_length) * hop_sec

    fig, axs = plt.subplots(3, 1, figsize=(15, 10), sharex=True)
    fig.suptitle(f"{tag_prefix} - Epoch {epoch}", fontsize=16)

    axs[0].plot(time_axis, true_don, label="True Don", color="blue", linestyle="--")
    axs[0].plot(time_axis, pred_don, label="Pred Don", color="lightblue", alpha=0.8)
    axs[0].set_ylabel("Don Energy")
    axs[0].legend()
    axs[0].grid(True)

    axs[1].plot(time_axis, true_ka, label="True Ka", color="red", linestyle="--")
    axs[1].plot(time_axis, pred_ka, label="Pred Ka", color="lightcoral", alpha=0.8)
    axs[1].set_ylabel("Ka Energy")
    axs[1].legend()
    axs[1].grid(True)

    axs[2].plot(
        time_axis, true_drumroll, label="True Drumroll", color="green", linestyle="--"
    )
    axs[2].plot(
        time_axis, pred_drumroll, label="Pred Drumroll", color="lightgreen", alpha=0.8
    )
    axs[2].set_ylabel("Drumroll Energy")
    axs[2].set_xlabel("Time (s)")
    axs[2].legend()
    axs[2].grid(True)

    plt.tight_layout(rect=[0, 0, 1, 0.96])  # Adjust layout to make space for suptitle
    writer.add_figure(f"{tag_prefix}/Energy_Comparison", fig, epoch)
    plt.close(fig)

def main():
    global ds

    # Calculate hop seconds for model output frames.
    # This assumes the model output time dimension corresponds to the mel spectrogram time dimension.
    output_frame_hop_sec = HOP_LENGTH / SAMPLE_RATE

    best_val_loss = float("inf")
    patience = 10  # Increased patience a bit
    pat_count = 0

    ds_oni = ds.map(
        preprocess,
        remove_columns=ds.column_names,
        fn_kwargs={"difficulty": "oni"},
        writer_batch_size=10,
    )
    ds_hard = ds.map(
        preprocess,
        remove_columns=ds.column_names,
        fn_kwargs={"difficulty": "hard"},
        writer_batch_size=10,
    )
    ds_normal = ds.map(
        preprocess,
        remove_columns=ds.column_names,
        fn_kwargs={"difficulty": "normal"},
        writer_batch_size=10,
    )
    ds = concatenate_datasets([ds_oni, ds_hard, ds_normal])

    ds_train_test = ds.train_test_split(test_size=0.1, seed=42)
    # ds_train_test.push_to_hub("JacobLinCool/taiko-conformer-6-ds")
    train_loader = DataLoader(
        ds_train_test["train"],
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=16,
        persistent_workers=True,
        prefetch_factor=4,
    )
    val_loader = DataLoader(
        ds_train_test["test"],
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=16,
        persistent_workers=True,
        prefetch_factor=4,
    )

    model = TaikoConformer6().to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

    criterion = TaikoEnergyLoss(reduction="mean").to(DEVICE)

    # Adjust scheduler steps for gradient accumulation
    num_optimizer_steps_per_epoch = math.ceil(len(train_loader) / GRAD_ACCUM_STEPS)
    total_optimizer_steps = EPOCHS * num_optimizer_steps_per_epoch

    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=LR, total_steps=total_optimizer_steps
    )

    writer = SummaryWriter()

    for epoch in range(1, EPOCHS + 1):
        model.train()
        total_epoch_loss = 0.0
        optimizer.zero_grad()

        for idx, batch in enumerate(tqdm(train_loader, desc=f"Train Epoch {epoch}")):
            mel = batch["mel"].to(DEVICE)
            # Unpack new energy-based labels
            don_labels = batch["don_labels"].to(DEVICE)
            ka_labels = batch["ka_labels"].to(DEVICE)
            drumroll_labels = batch["drumroll_labels"].to(DEVICE)
            lengths = batch["lengths"].to(
                DEVICE
            )  # These are for the Conformer model output
            nps = batch["nps"].to(DEVICE)
            difficulty = batch["difficulty"].to(DEVICE)  # Add difficulty
            level = batch["level"].to(DEVICE)  # Add level

            output_dict = model(
                mel, lengths, nps, difficulty, level
            )  # Pass difficulty and level
            # output_dict["presence"] is now (B, T_out, 3) for don, ka, drumroll energies
            pred_energies_batch = output_dict["presence"]  # (B, T_out, 3)

            loss_input_batch = {
                "don_labels": don_labels,
                "ka_labels": ka_labels,
                "drumroll_labels": drumroll_labels,
                "lengths": lengths,  # Pass lengths for masking within the loss function
            }
            loss = criterion(output_dict, loss_input_batch)

            (loss / GRAD_ACCUM_STEPS).backward()

            if (idx + 1) % GRAD_ACCUM_STEPS == 0 or (idx + 1) == len(train_loader):
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_epoch_loss += loss.item()

            # Log plot for the first sample of the first batch in each training epoch
            if idx == 0:
                first_sample_pred_don = pred_energies_batch[0, :, 0]
                first_sample_pred_ka = pred_energies_batch[0, :, 1]
                first_sample_pred_drumroll = pred_energies_batch[0, :, 2]

                first_sample_true_don = don_labels[0, :]
                first_sample_true_ka = ka_labels[0, :]
                first_sample_true_drumroll = drumroll_labels[0, :]

                first_sample_length = lengths[
                    0
                ].item()  # Get the valid length of the first sample

                log_energy_plots_to_tensorboard(
                    writer,
                    "Train/Sample_0",
                    epoch,
                    first_sample_pred_don,
                    first_sample_pred_ka,
                    first_sample_pred_drumroll,
                    first_sample_true_don,
                    first_sample_true_ka,
                    first_sample_true_drumroll,
                    first_sample_length,
                    output_frame_hop_sec,
                )

        avg_train_loss = total_epoch_loss / len(train_loader)
        writer.add_scalar("Loss/Train_Avg", avg_train_loss, epoch)

        # Validation
        model.eval()
        total_val_loss = 0.0
        # Removed storage for classification logits/labels and confusion matrix components

        with torch.no_grad():
            for val_idx, batch in enumerate(
                tqdm(val_loader, desc=f"Val Epoch {epoch}")
            ):
                mel = batch["mel"].to(DEVICE)
                don_labels = batch["don_labels"].to(DEVICE)
                ka_labels = batch["ka_labels"].to(DEVICE)
                drumroll_labels = batch["drumroll_labels"].to(DEVICE)
                lengths = batch["lengths"].to(DEVICE)
                nps = batch["nps"].to(DEVICE)  # Ground truth NPS from batch
                difficulty = batch["difficulty"].to(DEVICE)  # Add difficulty
                level = batch["level"].to(DEVICE)  # Add level

                output_dict = model(
                    mel, lengths, nps, difficulty, level
                )  # Pass difficulty and level
                pred_energies_val_batch = output_dict["presence"]  # (B, T_out, 3)

                val_loss_input_batch = {
                    "don_labels": don_labels,
                    "ka_labels": ka_labels,
                    "drumroll_labels": drumroll_labels,
                    "lengths": lengths,
                }
                val_loss = criterion(output_dict, val_loss_input_batch)
                total_val_loss += val_loss.item()

                # Log plot for the first sample of the first batch in each validation epoch
                if val_idx == 0:
                    first_val_sample_pred_don = pred_energies_val_batch[0, :, 0]
                    first_val_sample_pred_ka = pred_energies_val_batch[0, :, 1]
                    first_val_sample_pred_drumroll = pred_energies_val_batch[0, :, 2]

                    first_val_sample_true_don = don_labels[0, :]
                    first_val_sample_true_ka = ka_labels[0, :]
                    first_val_sample_true_drumroll = drumroll_labels[0, :]

                    first_val_sample_length = lengths[0].item()

                    log_energy_plots_to_tensorboard(
                        writer,
                        "Eval/Sample_0",
                        epoch,
                        first_val_sample_pred_don,
                        first_val_sample_pred_ka,
                        first_val_sample_pred_drumroll,
                        first_val_sample_true_don,
                        first_val_sample_true_ka,
                        first_val_sample_true_drumroll,
                        first_val_sample_length,
                        output_frame_hop_sec,
                    )

                # Log ground truth NPS for reference during validation if needed
                # writer.add_scalar("NPS/GT_Val_Batch_Avg", nps.mean().item(), epoch * len(val_loader) + idx)

        avg_val_loss = total_val_loss / len(val_loader)
        writer.add_scalar("Loss/Val_Avg", avg_val_loss, epoch)

        # Log learning rate
        current_lr = optimizer.param_groups[0]["lr"]
        writer.add_scalar("LR/learning_rate", current_lr, epoch)

        # Log ground truth NPS from the last validation batch (or mean over epoch)
        if "nps" in batch:  # Check if nps is in the last batch
            writer.add_scalar(
                "NPS/GT_Val_LastBatch_Avg", batch["nps"].mean().item(), epoch
            )

        print(
            f"Epoch {epoch:02d} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | LR: {current_lr:.2e}"
        )

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            pat_count = 0
            torch.save(model.state_dict(), "best_model.pt")  # Changed model save name
            print(f"Saved new best model to best_model.pt at epoch {epoch}")
        else:
            pat_count += 1
            if pat_count >= patience:
                print("Early stopping!")
                break
    writer.close()

    model_id = "JacobLinCool/taiko-conformer-6"
    try:
        model.push_to_hub(model_id, commit_message="Upload trained model")
        upload_folder(
            repo_id=model_id,
            folder_path="runs",
            path_in_repo=".",
            commit_message="Upload training logs",
            ignore_patterns=["*.txt", "*.json", "*.csv"],
        )
        print(f"Model and logs uploaded to {model_id}")
    except Exception as e:
        print(f"Error uploading to Hugging Face Hub: {e}")
        print("Make sure you have the correct permissions and try again.")


if __name__ == "__main__":
    main()
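As an editorial aside (not part of the committed files): a minimal sketch of reloading the checkpoint that this script saves, assuming best_model.pt sits in the working directory.

import torch
from tc6.model import TaikoConformer6
from tc6.config import DEVICE

# Recreate the architecture, load the saved weights, and switch to eval mode
model = TaikoConformer6().to(DEVICE)
model.load_state_dict(torch.load("best_model.pt", map_location=DEVICE))
model.eval()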
tc7/__init__.py
ADDED
File without changes

tc7/config.py
ADDED
@@ -0,0 +1,27 @@
import torch

# ─── 1) CONFIG ─────────────────────────────────────────────────────
SAMPLE_RATE = 22050
N_MELS = 80
HOP_LENGTH = 256
TIME_SUB = 1
CNN_CH = 256
N_HEADS = 8
D_MODEL = 512
FF_DIM = 1024
N_LAYERS = 6
DEPTHWISE_CONV_KERNEL_SIZE = 31
DROPOUT = 0.1
HIDDEN_DIM = 64
N_TYPES = 7
BATCH_SIZE = 2
GRAD_ACCUM_STEPS = 8
LR = 3e-4
EPOCHS = 200
NPS_PENALTY_WEIGHT_ALPHA = 0.3
NPS_PENALTY_WEIGHT_BETA = 1.0
DEVICE = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
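A quick illustrative check (not part of the commit) of the frame-rate and batching arithmetic these constants imply:

# Names below come from the config above
fps = SAMPLE_RATE / HOP_LENGTH                   # 22050 / 256 ≈ 86.13 model frames per second
hop_sec = HOP_LENGTH / SAMPLE_RATE               # ≈ 0.0116 s of audio per frame
effective_batch = BATCH_SIZE * GRAD_ACCUM_STEPS  # 2 * 8 = 16 sequences per optimizer step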
tc7/dataset.py
ADDED
@@ -0,0 +1,21 @@
from datasets import load_dataset, concatenate_datasets

ds1 = load_dataset("JacobLinCool/taiko-2023-1.1", split="train")
ds2 = load_dataset("JacobLinCool/taiko-2023-1.2", split="train")
ds3 = load_dataset("JacobLinCool/taiko-2023-1.3", split="train")
ds4 = load_dataset("JacobLinCool/taiko-2023-1.4", split="train")
ds5 = load_dataset("JacobLinCool/taiko-2023-1.5", split="train")
ds6 = load_dataset("JacobLinCool/taiko-2023-1.6", split="train")
ds7 = load_dataset("JacobLinCool/taiko-2023-1.7", split="train")
ds = concatenate_datasets([ds1, ds2, ds3, ds4, ds5, ds6, ds7]).with_format("torch")

good = list(range(len(ds)))
good.remove(1079)  # 1079 has a file problem
ds = ds.select(good)

# for local test
# ds = (
#     load_dataset("JacobLinCool/taiko-2023-1.6", split="train")
#     .with_format("torch")
#     .select(range(10))
# )
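An illustrative peek (not part of the commit) at the fields that tc7/preprocess.py reads from each example; exact tensor shapes depend on the clip:

sample = ds[0]
print(sample["audio"]["sampling_rate"])  # source sample rate of the clip
print(sample["audio"]["array"].shape)    # raw waveform tensor
print(len(sample["oni"]))                # onset tuples for the Oni course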
tc7/infer.py
ADDED
@@ -0,0 +1,354 @@
import time
import torch
import torchaudio
import matplotlib.pyplot as plt
import numpy as np
from .config import SAMPLE_RATE, N_MELS, HOP_LENGTH
import torch.profiler


# --- PREPROCESSING (match training) ---
def preprocess_audio(audio_path):
    wav, sr = torchaudio.load(audio_path)
    wav = wav.mean(dim=0)  # mono
    if sr != SAMPLE_RATE:
        wav = torchaudio.functional.resample(wav, sr, SAMPLE_RATE)
    wav = wav / (wav.abs().max() + 1e-8)  # Normalize audio

    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_mels=N_MELS,
        hop_length=HOP_LENGTH,
        n_fft=2048,
    )
    mel = mel_transform(wav)
    return mel  # mel is (N_MELS, T_mel)


# --- INFERENCE ---
def run_inference(model, mel_input, nps_input, difficulty_input, level_input, device):
    model.eval()
    with torch.no_grad():
        mel = mel_input.to(device).unsqueeze(0)  # (1, N_MELS, T_mel)
        nps = nps_input.to(device).unsqueeze(0)  # (1,)
        difficulty = difficulty_input.to(device).unsqueeze(0)  # (1,)
        level = level_input.to(device).unsqueeze(0)  # (1,)

        mel_cnn_input = mel.unsqueeze(1)  # (1, 1, N_MELS, T_mel)

        conformer_lengths = torch.tensor(
            [mel_cnn_input.shape[-1]], dtype=torch.long, device=device
        )

        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                *(
                    [torch.profiler.ProfilerActivity.CUDA]
                    if device.type == "cuda"
                    else []
                ),
            ],
            record_shapes=True,
            profile_memory=True,
            with_stack=False,
            with_flops=True,
        ) as prof:
            out_dict = model(mel_cnn_input, conformer_lengths, nps, difficulty, level)
        print(
            prof.key_averages().table(
                sort_by=(
                    "self_cuda_memory_usage"
                    if device.type == "cuda"
                    else "self_cpu_time_total"
                ),
                row_limit=20,
            )
        )

        energies = out_dict["presence"].squeeze(0).cpu().numpy()

        don_energy = energies[:, 0]
        ka_energy = energies[:, 1]
        drumroll_energy = energies[:, 2]

        return don_energy, ka_energy, drumroll_energy

# --- DECODE TO ONSETS ---
def decode_onsets(
    don_energy,
    ka_energy,
    drumroll_energy,
    hop_sec,
    threshold=0.5,
    min_distance_frames=3,
):
    results = []
    T_out = len(don_energy)
    last_onset_frame = -min_distance_frames

    for i in range(1, T_out - 1):  # Iterate considering neighbors for peak detection
        if i < last_onset_frame + min_distance_frames:
            continue

        e_don, e_ka, e_drum = don_energy[i], ka_energy[i], drumroll_energy[i]
        energies_at_i = {
            1: e_don,
            2: e_ka,
            5: e_drum,
        }  # Type mapping: 1:Don, 2:Ka, 5:Drumroll

        # Find which energy is max and whether it is a peak above threshold.
        # Sort by energy value descending to prioritize higher energy in case
        # of ties for the peak condition.
        sorted_types_by_energy = sorted(
            energies_at_i.keys(), key=lambda x: energies_at_i[x], reverse=True
        )

        for onset_type in sorted_types_by_energy:
            if onset_type == 1:
                current_energy_series = don_energy
            elif onset_type == 2:
                current_energy_series = ka_energy
            else:  # onset_type == 5
                current_energy_series = drumroll_energy

            energy_val = current_energy_series[i]

            if (
                energy_val > threshold
                and energy_val > current_energy_series[i - 1]
                and energy_val > current_energy_series[i + 1]
            ):
                # Iterating types in descending energy order and breaking after
                # the first detection keeps at most one onset type per frame.
                results.append((i * hop_sec, onset_type))
                last_onset_frame = i
                break  # Only one onset type per frame

    return results

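Putting the pieces above together; a usage sketch, where the Hub repo id, input file name, and control values are placeholders rather than anything this commit pins down:

import torch
from tc7.model import TaikoConformer7
from tc7.config import SAMPLE_RATE, HOP_LENGTH

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# from_pretrained comes from PyTorchModelHubMixin; the repo id is a placeholder
model = TaikoConformer7.from_pretrained("JacobLinCool/taiko-conformer-7").to(device)

mel = preprocess_audio("song.ogg")  # (N_MELS, T_mel)
nps = torch.tensor(6.0)             # desired note density (control input)
difficulty = torch.tensor(3.0)      # 3 = oni in preprocess.py's mapping
level = torch.tensor(9.0)
don, ka, roll = run_inference(model, mel, nps, difficulty, level, device)

hop_sec = HOP_LENGTH / SAMPLE_RATE  # seconds per output frame
onsets = decode_onsets(don, ka, roll, hop_sec, threshold=0.5)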
# --- VISUALIZATION ---
def plot_results(
    mel_spectrogram,
    don_energy,
    ka_energy,
    drumroll_energy,
    onsets,
    hop_sec,
    out_path=None,
):
    # mel_spectrogram is (N_MELS, T_mel)
    T_mel = mel_spectrogram.shape[1]
    T_out = len(don_energy)  # Length of energy arrays (model output time dimension)

    # Time axes. The CNN frontend does not stride in time, so the model output
    # has one frame per mel frame (T_out == T_mel), each spanning
    # HOP_LENGTH / SAMPLE_RATE seconds; the `hop_sec` passed by the caller is
    # assumed to match this output frame rate. (TIME_SUB only affects label
    # generation in preprocess.py, not the feature rate here.)
    time_axis_mel = np.arange(T_mel) * (HOP_LENGTH / SAMPLE_RATE)
    time_axis_energies = np.arange(T_out) * hop_sec

    fig, ax1 = plt.subplots(figsize=(100, 10))

    # Plot Mel Spectrogram on ax1
    mel_db = torchaudio.functional.amplitude_to_DB(
        mel_spectrogram, multiplier=10.0, amin=1e-10, db_multiplier=0.0
    )
    img = ax1.imshow(
        mel_db.numpy(),
        aspect="auto",
        origin="lower",
        cmap="magma",
        extent=[time_axis_mel[0], time_axis_mel[-1], 0, N_MELS],
    )
    ax1.set_title("Mel Spectrogram with Predicted Energies and Onsets")
    ax1.set_xlabel("Time (s)")
    ax1.set_ylabel("Mel Bin")
    fig.colorbar(img, ax=ax1, format="%+2.0f dB")

    # Create a second y-axis for energies
    ax2 = ax1.twinx()
    ax2.plot(time_axis_energies, don_energy, label="Don Energy", color="red")
    ax2.plot(time_axis_energies, ka_energy, label="Ka Energy", color="blue")
    ax2.plot(
        time_axis_energies, drumroll_energy, label="Drumroll Energy", color="green"
    )
    ax2.set_ylabel("Energy")
    ax2.set_ylim(0, 1.2)  # Assuming energies are somewhat normalized or bounded

    # Overlay onsets from decode_onsets (t is already in seconds)
    labeled_types = set()
    # Group drumrolls into segments (reuse logic from write_tja)
    drumroll_times = [t_sec for t_sec, typ in onsets if typ == 5]
    drumroll_times.sort()
    drumroll_segments = []
    if drumroll_times:
        seg_start = drumroll_times[0]
        prev = drumroll_times[0]
        for t in drumroll_times[1:]:
            if t - prev <= hop_sec * 6:  # up to 5-frame gap
                prev = t
            else:
                drumroll_segments.append((seg_start, prev))
                seg_start = t
                prev = t
        drumroll_segments.append((seg_start, prev))
    # Plot Don/Ka onsets as vertical lines
    for t_sec, typ in onsets:
        if typ == 5:
            continue  # skip drumroll onsets
        color_map = {1: "darkred", 2: "darkblue"}
        label_map = {1: "Don Onset", 2: "Ka Onset"}
        line_color = color_map.get(typ, "black")
        line_label = label_map.get(typ, f"Type {typ} Onset")
        if typ not in labeled_types:
            ax1.axvline(
                t_sec, color=line_color, linestyle="--", alpha=0.9, label=line_label
            )
            labeled_types.add(typ)
        else:
            ax1.axvline(t_sec, color=line_color, linestyle="--", alpha=0.9)
    # Plot drumroll segments as shaded regions
    for seg_start, seg_end in drumroll_segments:
        ax1.axvspan(
            seg_start,
            seg_end + hop_sec,
            color="green",
            alpha=0.2,
            label="Drumroll Segment" if "drumroll" not in labeled_types else None,
        )
        labeled_types.add("drumroll")

    # Combine legends from both axes
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc="upper right")

    fig.tight_layout()

    # Save to file if a path is provided, otherwise return the figure object
    if out_path:
        plt.savefig(out_path)
        print(f"Saved plot to {out_path}")
        plt.close(fig)
        return out_path
    else:
        return fig

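For reference, the slot grid used by write_tja below: at the default bpm = 160 and quantize = 96, one 4/4 measure lasts 1.5 s, so each chart slot spans about 15.6 ms, roughly 1.3 model frames (illustrative arithmetic, not part of the commit):

bpm, quantize = 160, 96
sec_per_measure = 60 / bpm * 4         # 1.5 s per 4/4 measure
slot_sec = sec_per_measure / quantize  # 0.015625 s per chart slot
print(sec_per_measure, slot_sec)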
def write_tja(onsets, out_path=None, bpm=160, quantize=96, audio="audio.wav"):
    # TJA types: 0:no note, 1:Don, 2:Ka, 3:BigDon, 4:BigKa, 5:DrumrollStart, 8:DrumrollEnd
    # Model output types: 1:Don, 2:Ka, 5:Drumroll (interpreted as start/single)
    sec_per_beat = 60 / bpm
    beats_per_measure = 4  # Assuming 4/4 time signature
    sec_per_measure = sec_per_beat * beats_per_measure
    # Step 1: Map onsets to (measure_idx, slot, typ)
    slot_events = []
    for t, typ in onsets:
        measure_idx = int(t // sec_per_measure)
        t_in_measure = t % sec_per_measure
        slot = int(round(t_in_measure / sec_per_measure * quantize))
        if slot >= quantize:
            slot = quantize - 1
        slot_events.append((measure_idx, slot, typ))
    # Step 2: Build measure/slot grid
    if slot_events:
        max_measure_idx = max(m for m, _, _ in slot_events)
    else:
        max_measure_idx = -1
    measures = {i: [0] * quantize for i in range(max_measure_idx + 1)}
    # Step 3: Place Don/Ka, collect drumrolls
    drumroll_slots = set()
    for m, s, typ in slot_events:
        if typ in [1, 2]:
            measures[m][s] = typ
        elif typ == 5:
            drumroll_slots.add((m, s))
    # Step 4: Process drumrolls into contiguous regions, mark 5 (start) and 8 (end)
    # Flatten all slots to a sorted list of (measure, slot)
    drumroll_list = sorted(drumroll_slots)
    # Group into contiguous regions (allowing a gap of 5 slots)
    grouped = []
    group = []
    for ms in drumroll_list:
        if not group:
            group = [ms]
        else:
            last_m, last_s = group[-1]
            m, s = ms
            # Calculate slot distance, considering measure wrap
            if m == last_m:
                slot_dist = s - last_s
            elif m == last_m + 1 and last_s <= quantize - 1:
                slot_dist = (quantize - 1 - last_s) + s + 1
            else:
                slot_dist = None
            # Allow gap of up to 5 slots (slot_dist <= 6)
            if slot_dist is not None and 1 <= slot_dist <= 6:
                group.append(ms)
            else:
                grouped.append(group)
                group = [ms]
    if group:
        grouped.append(group)
    # Mark 5 (start) and 8 (end) for each group
    for region in grouped:
        if len(region) == 1:
            m, s = region[0]
            measures[m][s] = 5
            # Place 8 in next slot (or next measure if at end)
            if s < quantize - 1:
                measures[m][s + 1] = 8
            elif m < max_measure_idx:
                measures[m + 1][0] = 8
        else:
            m_start, s_start = region[0]
            m_end, s_end = region[-1]
            measures[m_start][s_start] = 5
            measures[m_end][s_end] = 8
            # Middle slots stay 0 (already the default)
    # Step 5: Generate TJA content
    tja_content = []
    tja_content.append(f"TITLE:{audio} (TC7, {time.strftime('%Y-%m-%d %H:%M:%S')})")
    tja_content.append(f"BPM:{bpm}")
    tja_content.append(f"WAVE:{audio}")
    tja_content.append("OFFSET:0")
    tja_content.append("COURSE:Oni\nLEVEL:9\n")
    tja_content.append("#START")
    for i in range(max_measure_idx + 1):
        notes = measures.get(i, [0] * quantize)
        line = "".join(str(n) for n in notes)
        tja_content.append(line + ",")
    tja_content.append("#END")

    tja_string = "\n".join(tja_content)

    # If out_path is provided, also write to file
    if out_path:
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(tja_string)
        print(f"TJA chart saved to {out_path}")

    return tja_string
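Continuing the usage sketch from above, the decoded onsets can then be rendered to a chart and a diagnostic plot (file names are placeholders):

tja_string = write_tja(onsets, out_path="chart.tja", bpm=160, audio="song.ogg")
plot_results(mel, don, ka, roll, onsets, hop_sec, out_path="chart.png")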
tc7/loss.py
ADDED
@@ -0,0 +1,94 @@
import torch
import torch.nn as nn


class TaikoLoss(nn.Module):
    def __init__(
        self,
        reduction="mean",
        nps_penalty_weight_alpha=0.3,
        nps_penalty_weight_beta=1.0,
    ):
        super().__init__()
        self.mse_loss = nn.MSELoss(reduction="none")
        self.reduction = reduction
        self.nps_penalty_weight_alpha = nps_penalty_weight_alpha
        self.nps_penalty_weight_beta = nps_penalty_weight_beta

    def forward(self, outputs, batch):
        """
        Calculates the MSE loss for energy-based predictions, with a two-level penalty
        based on sliding NPS values:
        - a heavier penalty where sliding_nps is 0;
        - a continuous penalty where sliding_nps > 0.

        Args:
            outputs (dict): Model output, containing a 'presence' tensor.
                outputs['presence'] shape: (B, T, 3) for don, ka, drumroll energies.
            batch (dict): Batch data from collate_fn, containing true labels, lengths,
                and sliding_nps_labels.
                batch['sliding_nps_labels'] shape: (B, T)
        Returns:
            torch.Tensor: The calculated loss.
        """
        pred_energies = outputs["presence"]  # (B, T, 3)
        true_don = batch["don_labels"]  # (B, T)
        true_ka = batch["ka_labels"]  # (B, T)
        true_drumroll = batch["drumroll_labels"]  # (B, T)
        true_energies = torch.stack([true_don, true_ka, true_drumroll], dim=2).to(
            pred_energies.device
        )  # (B, T, 3)

        B, T, _ = pred_energies.shape

        # Create a mask based on batch['lengths'] to ignore padded parts of sequences.
        # batch['lengths'] gives the actual length of each sequence in the batch.
        # mask shape: (B, T)
        mask_2d = torch.arange(T, device=pred_energies.device).expand(B, T) < batch[
            "lengths"
        ].to(pred_energies.device).unsqueeze(1)
        # Expand mask to (B, T, 1) to broadcast across the 3 energy channels
        mask_3d = mask_2d.unsqueeze(2)  # (B, T, 1)

        # Calculate element-wise MSE loss
        mse_loss_elementwise = self.mse_loss(pred_energies, true_energies)  # (B, T, 3)

        # Calculate two-level sliding NPS penalty
        sliding_nps = batch["sliding_nps_labels"].to(pred_energies.device)  # (B, T)

        penalty_coefficients = torch.zeros_like(sliding_nps)  # (B, T)

        is_zero_nps = sliding_nps == 0.0
        is_not_zero_nps = ~is_zero_nps

        # Apply heavy penalty where sliding_nps is 0
        penalty_coefficients[is_zero_nps] = self.nps_penalty_weight_beta

        # Apply continuous penalty where sliding_nps > 0
        penalty_coefficients[is_not_zero_nps] = self.nps_penalty_weight_alpha * (
            1 - sliding_nps[is_not_zero_nps]
        )

        # Apply penalty factor to the MSE loss
        loss_elementwise = mse_loss_elementwise * (
            1 + penalty_coefficients.unsqueeze(2)
        )

        # Apply the mask to the combined loss
        masked_loss = loss_elementwise * mask_3d

        if self.reduction == "mean":
            # Sum the loss over all valid (unmasked) elements and divide by their count
            total_loss = masked_loss.sum()
            num_valid_elements = mask_3d.sum()  # Number of valid (B, T) positions (the mask has a singleton channel dim)
            if num_valid_elements > 0:
                return total_loss / num_valid_elements
            else:
                # Avoid division by zero if there are no valid elements (e.g., empty batch or all lengths are 0)
                return torch.tensor(
                    0.0, device=pred_energies.device, requires_grad=True
                )
        elif self.reduction == "sum":
            return masked_loss.sum()
        else:  # 'none' or any other case
            return masked_loss
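A worked example of the penalty weighting above, using the defaults from tc7/config.py (alpha = 0.3, beta = 1.0):

alpha, beta = 0.3, 1.0
for s in (0.0, 0.5, 1.0):  # normalized sliding-NPS values
    coeff = beta if s == 0.0 else alpha * (1 - s)
    print(s, 1 + coeff)  # MSE multiplier: 2.0x at s=0, 1.15x at s=0.5, 1.0x at s=1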
tc7/model.py
ADDED
@@ -0,0 +1,166 @@
import torch
import torch.nn as nn
from torchaudio.models import Conformer
from huggingface_hub import PyTorchModelHubMixin
from .config import (
    N_MELS,
    CNN_CH,
    N_HEADS,
    D_MODEL,
    FF_DIM,
    N_LAYERS,
    DROPOUT,
    DEPTHWISE_CONV_KERNEL_SIZE,
    HIDDEN_DIM,
    DEVICE,
)


class TaikoConformer7(nn.Module, PyTorchModelHubMixin):
    def __init__(self):
        super().__init__()
        # 1) CNN frontend: frequency-only pooling
        self.cnn = nn.Sequential(
            nn.Conv2d(1, CNN_CH, 3, stride=(2, 1), padding=1),
            nn.BatchNorm2d(CNN_CH),
            nn.GELU(),
            nn.Dropout2d(DROPOUT),
            nn.Conv2d(CNN_CH, CNN_CH, 3, stride=(2, 1), padding=1),
            nn.BatchNorm2d(CNN_CH),
            nn.GELU(),
            nn.Dropout2d(DROPOUT),
        )
        feat_dim = CNN_CH * (N_MELS // 4)

        # 2) Linear projection to model dimension
        self.proj = nn.Linear(feat_dim, D_MODEL)

        # 3) FiLM conditioning for notes_per_second, difficulty, and level
        self.film_nps = nn.Linear(1, 2 * D_MODEL)
        self.film_difficulty = nn.Linear(
            1, 2 * D_MODEL
        )  # Assuming difficulty is a single scalar
        self.film_level = nn.Linear(1, 2 * D_MODEL)  # Assuming level is a single scalar

        # 4) Conformer encoder
        self.encoder = Conformer(
            input_dim=D_MODEL,
            num_heads=N_HEADS,
            ffn_dim=FF_DIM,
            num_layers=N_LAYERS,
            depthwise_conv_kernel_size=DEPTHWISE_CONV_KERNEL_SIZE,
            dropout=DROPOUT,
            use_group_norm=False,
            convolution_first=False,
        )

        # 5) Presence regressor head
        self.presence_regressor = nn.Sequential(
            nn.Dropout(DROPOUT),
            nn.Linear(D_MODEL, HIDDEN_DIM),
            nn.GELU(),
            nn.Dropout(DROPOUT),
            nn.Linear(HIDDEN_DIM, 3),  # Don, Ka, DrumRoll energy
            nn.Sigmoid(),  # Output between 0 and 1
        )

        # 6) Initialize weights
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
            elif isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(
        self,
        mel: torch.Tensor,
        lengths: torch.Tensor,
        notes_per_second: torch.Tensor,
        difficulty: torch.Tensor,
        level: torch.Tensor,
    ):
        """
        Args:
            mel: (B, 1, N_MELS, T_mel)
            lengths: (B,) sequence lengths after the CNN frontend
            notes_per_second: (B,) note-density control values
            difficulty: (B,) difficulty values
            level: (B,) level values
        Returns:
            Dict with:
                'presence': (B, T_cnn_out, 3)  # Corrected from 4 to 3
                'lengths': lengths
        """
        # CNN frontend
        x = self.cnn(mel)  # (B, C, F, T)
        B, C, F, T = x.size()
        x = x.permute(0, 3, 1, 2).reshape(B, T, C * F)

        # Project to model dimension
        x = self.proj(x)  # (B, T, D_MODEL)

        # FiLM conditioning
        nps = notes_per_second.unsqueeze(-1).float()  # (B, 1)
        gamma_beta_nps = self.film_nps(nps)  # (B, 2*D_MODEL)
        gamma_nps, beta_nps = gamma_beta_nps.chunk(2, dim=-1)
        x = gamma_nps.unsqueeze(1) * x + beta_nps.unsqueeze(1)

        diff = difficulty.unsqueeze(-1).float()  # (B, 1)
        gamma_beta_diff = self.film_difficulty(diff)  # (B, 2*D_MODEL)
        gamma_diff, beta_diff = gamma_beta_diff.chunk(2, dim=-1)
        x = gamma_diff.unsqueeze(1) * x + beta_diff.unsqueeze(1)

        lvl = level.unsqueeze(-1).float()  # (B, 1)
        gamma_beta_lvl = self.film_level(lvl)  # (B, 2*D_MODEL)
        gamma_lvl, beta_lvl = gamma_beta_lvl.chunk(2, dim=-1)
        x = gamma_lvl.unsqueeze(1) * x + beta_lvl.unsqueeze(1)

        # Conformer encoder
        x, _ = self.encoder(x, lengths=lengths)

        # Presence prediction
        presence = self.presence_regressor(x)
        return {"presence": presence, "lengths": lengths}


if __name__ == "__main__":
    model = TaikoConformer7().to(device=DEVICE)
    print(model)

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"{name}: {param.numel():,}")

    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {params / 1e6:.2f}M")

    batch_size = 4
    mel_time_steps = 1024
    input_mel = torch.randn(batch_size, 1, N_MELS, mel_time_steps).to(DEVICE)

    conformer_lengths = torch.tensor(
        [mel_time_steps] * batch_size, dtype=torch.long
    ).to(DEVICE)

    notes_per_second_input = torch.tensor([10.0] * batch_size, dtype=torch.float32).to(
        DEVICE
    )
    difficulty_input = torch.tensor([1.0] * batch_size, dtype=torch.float32).to(
        DEVICE
    )  # Example difficulty
    level_input = torch.tensor([5.0] * batch_size, dtype=torch.float32).to(
        DEVICE
    )  # Example level

    output = model(
        input_mel,
        conformer_lengths,
        notes_per_second_input,
        difficulty_input,
        level_input,
    )
    print("Output shapes:")
    for key, value in output.items():
        print(f"{key}: {value.shape}")
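Shape arithmetic behind feat_dim in the CNN frontend above: the two stride-(2, 1) convolutions halve the frequency axis twice and leave the time axis untouched, so:

N_MELS, CNN_CH = 80, 256
freq_after_cnn = N_MELS // 2 // 2    # 80 -> 40 -> 20 mel bins
feat_dim = CNN_CH * freq_after_cnn   # 256 * 20 = 5120, the input width of self.proj
print(freq_after_cnn, feat_dim)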
tc7/preprocess.py
ADDED
@@ -0,0 +1,400 @@
import math
import torch
import torch.nn as nn
import torchaudio
from torchaudio.transforms import FrequencyMasking
from tja import parse_tja, PyParsingMode
from .config import N_TYPES, SAMPLE_RATE, N_MELS, HOP_LENGTH, TIME_SUB
from .model import TaikoConformer7


mel_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_mels=N_MELS,
    hop_length=HOP_LENGTH,
    n_fft=2048,
)


freq_mask = FrequencyMasking(freq_mask_param=15)

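For a sense of scale (illustrative, not part of the commit): a 10-second clip at the configured rates yields roughly 862 mel frames from the transform above.

import torch
wav = torch.randn(SAMPLE_RATE * 10)    # 10 s of (random) audio at SAMPLE_RATE
mel = mel_transform(wav).unsqueeze(0)  # (1, N_MELS, ~862); T ≈ len(wav) / HOP_LENGTH + 1
print(mel.shape)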
| 22 |
+
def preprocess(example, difficulty="oni"):
|
| 23 |
+
wav_tensor = example["audio"]["array"]
|
| 24 |
+
sr = example["audio"]["sampling_rate"]
|
| 25 |
+
# 1) load & resample
|
| 26 |
+
if sr != SAMPLE_RATE:
|
| 27 |
+
wav_tensor = torchaudio.functional.resample(wav_tensor, sr, SAMPLE_RATE)
|
| 28 |
+
# normalize audio
|
| 29 |
+
wav_tensor = wav_tensor / (wav_tensor.abs().max() + 1e-8)
|
| 30 |
+
# add random Gaussian noise
|
| 31 |
+
if torch.rand(1).item() < 0.5:
|
| 32 |
+
wav_tensor = wav_tensor + 0.005 * torch.randn_like(wav_tensor)
|
| 33 |
+
# 2) mel: (1, N_MELS, T)
|
| 34 |
+
mel = mel_transform(wav_tensor).unsqueeze(0)
|
| 35 |
+
# apply SpecAugment
|
| 36 |
+
mel = freq_mask(mel)
|
| 37 |
+
_, _, T = mel.shape
|
| 38 |
+
# 3) build label sequence of length ceil(T / TIME_SUB)
|
| 39 |
+
T_sub = math.ceil(T / TIME_SUB)
|
| 40 |
+
|
| 41 |
+
# Initialize energy-based labels for Don, Ka, Drumroll
|
| 42 |
+
don_labels = torch.zeros(T_sub, dtype=torch.float32)
|
| 43 |
+
ka_labels = torch.zeros(T_sub, dtype=torch.float32)
|
| 44 |
+
drumroll_labels = torch.zeros(T_sub, dtype=torch.float32)
|
| 45 |
+
sliding_nps_labels = torch.zeros(
|
| 46 |
+
T_sub, dtype=torch.float32
|
| 47 |
+
) # New label for sliding NPS
|
| 48 |
+
|
| 49 |
+
# Define exponential decay tail parameters
|
| 50 |
+
tail_length = 40 # number of frames for decay tail
|
| 51 |
+
decay_rate = 8.0 # decay rate parameter, adjust as needed
|
| 52 |
+
tail_kernel = torch.exp(
|
| 53 |
+
-torch.arange(0, tail_length, dtype=torch.float32) / decay_rate
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
fps = SAMPLE_RATE / HOP_LENGTH
|
| 57 |
+
num_valid_notes = 0
|
| 58 |
+
for onset in example[difficulty]:
|
| 59 |
+
typ, t_start, t_end, *_ = onset
|
| 60 |
+
|
| 61 |
+
# Assuming N_TYPES in config is appropriately set (e.g., 7 or more)
|
| 62 |
+
if typ < 1 or typ > N_TYPES: # Filter out invalid types
|
| 63 |
+
continue
|
| 64 |
+
|
| 65 |
+
num_valid_notes += 1
|
| 66 |
+
|
| 67 |
+
exact_frame_start = t_start.item() * fps
|
| 68 |
+
|
| 69 |
+
# Type 1 and 3 are Don, Type 2 and 4 are Ka
|
| 70 |
+
if typ == 1 or typ == 3 or typ == 2 or typ == 4:
|
| 71 |
+
exact_hit_time_sub = exact_frame_start / TIME_SUB
|
| 72 |
+
|
| 73 |
+
current_labels = don_labels if (typ == 1 or typ == 3) else ka_labels
|
| 74 |
+
|
| 75 |
+
start_points_info = []
|
| 76 |
+
rounded_hit_time_sub = round(exact_hit_time_sub)
|
| 77 |
+
|
| 78 |
+
if (
|
| 79 |
+
abs(exact_hit_time_sub - rounded_hit_time_sub) < 1e-6
|
| 80 |
+
): # Tolerance for float precision
|
| 81 |
+
idx_single = int(rounded_hit_time_sub)
|
| 82 |
+
if 0 <= idx_single < T_sub:
|
| 83 |
+
start_points_info.append({"idx": idx_single, "weight": 1.0})
|
| 84 |
+
else:
|
| 85 |
+
idx_floor = math.floor(exact_hit_time_sub)
|
| 86 |
+
idx_ceil = idx_floor + 1
|
| 87 |
+
|
| 88 |
+
frac = exact_hit_time_sub - idx_floor
|
| 89 |
+
weight_ceil = frac
|
| 90 |
+
weight_floor = 1.0 - frac
|
| 91 |
+
|
| 92 |
+
if weight_floor > 1e-6 and 0 <= idx_floor < T_sub:
|
| 93 |
+
start_points_info.append({"idx": idx_floor, "weight": weight_floor})
|
| 94 |
+
if weight_ceil > 1e-6 and 0 <= idx_ceil < T_sub:
|
| 95 |
+
start_points_info.append({"idx": idx_ceil, "weight": weight_ceil})
|
| 96 |
+
|
| 97 |
+
for point_info in start_points_info:
|
| 98 |
+
start_idx = point_info["idx"]
|
| 99 |
+
weight = point_info["weight"]
|
| 100 |
+
for k_idx, kernel_val in enumerate(tail_kernel):
|
| 101 |
+
target_idx = start_idx + k_idx
|
| 102 |
+
if 0 <= target_idx < T_sub:
|
| 103 |
+
current_labels[target_idx] = max(
|
| 104 |
+
current_labels[target_idx].item(),
|
| 105 |
+
weight * kernel_val.item(),
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
# Type 5, 6, 7 are Drumroll
|
| 109 |
+
elif typ >= 5 and typ <= 7:
|
| 110 |
+
exact_frame_end = t_end.item() * fps
|
| 111 |
+
exact_start_time_sub = exact_frame_start / TIME_SUB
|
| 112 |
+
exact_end_time_sub = exact_frame_end / TIME_SUB
|
| 113 |
+
|
| 114 |
+
# Improved drumroll body
|
| 115 |
+
body_loop_start_idx = math.floor(exact_start_time_sub)
|
| 116 |
+
body_loop_end_idx = math.ceil(exact_end_time_sub)
|
| 117 |
+
|
| 118 |
+
for dr_idx in range(body_loop_start_idx, body_loop_end_idx):
|
| 119 |
+
if 0 <= dr_idx < T_sub:
|
| 120 |
+
drumroll_labels[dr_idx] = 1.0
|
| 121 |
+
|
| 122 |
+
# Improved drumroll tail (starts from exact_end_time_sub)
|
| 123 |
+
tail_start_points_info = []
|
| 124 |
+
rounded_end_time_sub = round(exact_end_time_sub)
|
| 125 |
+
if abs(exact_end_time_sub - rounded_end_time_sub) < 1e-6:
|
| 126 |
+
idx_single_tail = int(rounded_end_time_sub)
|
| 127 |
+
if 0 <= idx_single_tail < T_sub:
|
| 128 |
+
tail_start_points_info.append(
|
| 129 |
+
{"idx": idx_single_tail, "weight": 1.0}
|
| 130 |
+
)
|
| 131 |
+
else:
|
| 132 |
+
idx_floor_tail = math.floor(exact_end_time_sub)
|
| 133 |
+
idx_ceil_tail = idx_floor_tail + 1
|
| 134 |
+
|
| 135 |
+
frac_tail = exact_end_time_sub - idx_floor_tail
|
| 136 |
+
weight_ceil_tail = frac_tail
|
| 137 |
+
weight_floor_tail = 1.0 - frac_tail
|
| 138 |
+
|
| 139 |
+
if weight_floor_tail > 1e-6 and 0 <= idx_floor_tail < T_sub:
|
| 140 |
+
tail_start_points_info.append(
|
| 141 |
+
{"idx": idx_floor_tail, "weight": weight_floor_tail}
|
| 142 |
+
)
|
| 143 |
+
if weight_ceil_tail > 1e-6 and 0 <= idx_ceil_tail < T_sub:
|
| 144 |
+
tail_start_points_info.append(
|
| 145 |
+
{"idx": idx_ceil_tail, "weight": weight_ceil_tail}
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
for point_info in tail_start_points_info:
|
| 149 |
+
start_idx = point_info["idx"]
|
| 150 |
+
weight = point_info["weight"]
|
| 151 |
+
for k_idx, kernel_val in enumerate(tail_kernel):
|
| 152 |
+
target_idx = start_idx + k_idx
|
| 153 |
+
if 0 <= target_idx < T_sub:
|
| 154 |
+
drumroll_labels[target_idx] = max(
|
| 155 |
+
drumroll_labels[target_idx].item(),
|
| 156 |
+
weight * kernel_val.item(),
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
# Calculate sliding window NPS
|
| 160 |
+
note_events = (
|
| 161 |
+
[]
|
| 162 |
+
) # Store tuples of (time_sec, type_is_drumroll_start_or_end, duration_if_drumroll)
|
| 163 |
+
for onset in example[difficulty]:
|
| 164 |
+
typ, t_start_tensor, t_end_tensor, *_ = onset
|
| 165 |
+
t_start = t_start_tensor.item()
|
| 166 |
+
t_end = t_end_tensor.item()
|
| 167 |
+
|
| 168 |
+
if typ in [1, 2, 3, 4]: # Don or Ka
|
| 169 |
+
note_events.append(
|
| 170 |
+
(t_start, False, 0)
|
| 171 |
+
) # False indicates not a drumroll event, duration 0
|
| 172 |
+
elif typ >= 5 and typ <= 7: # Drumroll
|
| 173 |
+
note_events.append(
|
| 174 |
+
(t_start, True, t_end - t_start)
|
| 175 |
+
) # True indicates drumroll start, store duration
|
| 176 |
+
# We don't explicitly need a drumroll end event for this calculation method
|
| 177 |
+
|
| 178 |
+
note_events.sort(key=lambda x: x[0]) # Sort by time
|
| 179 |
+
|
| 180 |
+
window_duration_seconds = 0.5
|
| 181 |
+
# drumroll_nps_rate = 10.0 # Removed: Will use adaptive rate
|
| 182 |
+
|
| 183 |
+
# Step 1: Calculate base_sliding_nps_labels (Don/Ka only)
|
| 184 |
+
base_don_ka_sliding_nps = torch.zeros(T_sub, dtype=torch.float32)
|
| 185 |
+
time_step_duration_sec = TIME_SUB / fps # Duration of one T_sub segment
|
| 186 |
+
|
| 187 |
+
for k_idx in range(T_sub):
|
| 188 |
+
k_window_end_sec = ((k_idx + 1) * TIME_SUB) / fps
|
| 189 |
+
k_window_start_sec = k_window_end_sec - window_duration_seconds
|
| 190 |
+
|
| 191 |
+
current_don_ka_count = 0.0
|
| 192 |
+
for event_t, is_drumroll, _ in note_events:
|
| 193 |
+
if not is_drumroll: # Don or Ka hit
|
| 194 |
+
if k_window_start_sec <= event_t < k_window_end_sec:
|
| 195 |
+
current_don_ka_count += 1
|
| 196 |
+
base_don_ka_sliding_nps[k_idx] = current_don_ka_count / window_duration_seconds
|
| 197 |
+
|
| 198 |
+
# Step 2: Calculate adaptive_drumroll_rates_for_all_events
|
| 199 |
+
adaptive_drumroll_rates_for_all_events = []
|
| 200 |
+
for event_t, is_drumroll, drumroll_dur in note_events:
|
| 201 |
+
if is_drumroll:
|
| 202 |
+
drumroll_start_sec = event_t
|
| 203 |
+
drumroll_end_sec = event_t + drumroll_dur
|
| 204 |
+
|
| 205 |
+
slice_start_idx = math.floor(drumroll_start_sec / time_step_duration_sec)
|
| 206 |
+
slice_end_idx = math.ceil(drumroll_end_sec / time_step_duration_sec)
|
| 207 |
+
|
| 208 |
+
slice_start_idx = max(0, slice_start_idx)
|
| 209 |
+
slice_end_idx = min(T_sub, slice_end_idx)
|
| 210 |
+
|
| 211 |
+
max_nps_in_drumroll_period = 0.0
|
| 212 |
+
if slice_start_idx < slice_end_idx:
|
| 213 |
+
relevant_base_nps_values = base_don_ka_sliding_nps[
|
| 214 |
+
slice_start_idx:slice_end_idx
|
| 215 |
+
]
|
| 216 |
+
if relevant_base_nps_values.numel() > 0:
|
| 217 |
+
max_nps_in_drumroll_period = torch.max(
|
| 218 |
+
relevant_base_nps_values
|
| 219 |
+
).item()
|
| 220 |
+
|
| 221 |
+
rate = max(5.0, max_nps_in_drumroll_period)
|
| 222 |
+
adaptive_drumroll_rates_for_all_events.append(rate)
|
| 223 |
+
else:
|
| 224 |
+
adaptive_drumroll_rates_for_all_events.append(0.0) # Placeholder
|
| 225 |
+
|
| 226 |
+
# Step 3: Calculate final sliding_nps_labels using adaptive rates
|
| 227 |
+
# sliding_nps_labels is already initialized with zeros earlier in the function.
|
| 228 |
+
for k_idx in range(T_sub):
|
| 229 |
+
k_window_end_sec = ((k_idx + 1) * TIME_SUB) / fps
|
| 230 |
+
k_window_start_sec = k_window_end_sec - window_duration_seconds
|
| 231 |
+
|
| 232 |
+
current_window_total_nps_contribution = 0.0
|
| 233 |
+
for event_idx, (event_t, is_drumroll, drumroll_dur) in enumerate(note_events):
|
| 234 |
+
if is_drumroll:
|
| 235 |
+
drumroll_start_sec = event_t
|
| 236 |
+
drumroll_end_sec = event_t + drumroll_dur
|
| 237 |
+
|
| 238 |
+
overlap_start = max(k_window_start_sec, drumroll_start_sec)
|
| 239 |
+
overlap_end = min(k_window_end_sec, drumroll_end_sec)
|
| 240 |
+
|
| 241 |
+
if overlap_end > overlap_start:
|
| 242 |
+
overlap_duration = overlap_end - overlap_start
|
| 243 |
+
current_adaptive_rate = adaptive_drumroll_rates_for_all_events[
|
| 244 |
+
event_idx
|
| 245 |
+
]
|
| 246 |
+
current_window_total_nps_contribution += (
|
| 247 |
+
overlap_duration * current_adaptive_rate
|
| 248 |
+
)
|
| 249 |
+
else: # Don or Ka hit
|
| 250 |
+
if k_window_start_sec <= event_t < k_window_end_sec:
|
| 251 |
+
current_window_total_nps_contribution += (
|
| 252 |
+
1 # Each hit contributes 1 to the count
|
| 253 |
+
)
|
| 254 |
+
|
| 255 |
+
sliding_nps_labels[k_idx] = (
|
| 256 |
+
current_window_total_nps_contribution / window_duration_seconds
|
| 257 |
+
)
|
| 258 |
+
|
| 259 |
+
# Normalize sliding_nps_labels to 0-1 range
|
| 260 |
+
if T_sub > 0: # Ensure there are elements to normalize
|
| 261 |
+
min_nps_val = torch.min(sliding_nps_labels)
|
| 262 |
+
max_nps_val = torch.max(sliding_nps_labels)
|
| 263 |
+
denominator = max_nps_val - min_nps_val
|
| 264 |
+
if denominator > 1e-6: # Use a small epsilon for float comparison
|
| 265 |
+
sliding_nps_labels = (sliding_nps_labels - min_nps_val) / denominator
|
| 266 |
+
else:
|
| 267 |
+
# If all values are (nearly) the same
|
| 268 |
+
if max_nps_val > 1e-6: # If the constant value is positive
|
| 269 |
+
sliding_nps_labels = torch.ones_like(sliding_nps_labels)
|
| 270 |
+
else: # If the constant value is zero (or very close to it)
|
| 271 |
+
sliding_nps_labels = torch.zeros_like(sliding_nps_labels)
|
| 272 |
+
|
| 273 |
    duration_seconds = wav_tensor.shape[-1] / SAMPLE_RATE
    nps = num_valid_notes / duration_seconds if duration_seconds > 0 else 0.0

    parsed = parse_tja(example["tja"], mode=PyParsingMode.Full)
    chart = next(
        (chart for chart in parsed.charts if chart.course.lower() == difficulty), None
    )
    # Map the course name to an integer id: easy=0, normal=1, hard=2, oni=3,
    # and 4 for anything else (assumed to be edit/ura).
    difficulty_id = (
        0
        if difficulty == "easy"
        else (
            1
            if difficulty == "normal"
            else 2 if difficulty == "hard" else 3 if difficulty == "oni" else 4
        )
    )
    level = chart.level if chart else 0

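    # An equivalent lookup-based sketch (not used above, shown for clarity):
    #   difficulty_id = {"easy": 0, "normal": 1, "hard": 2, "oni": 3}.get(difficulty, 4)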
    # --- CNN shape inference and label padding/truncation ---
    # Simulate the CNN to get the output time length (T_cnn)
    dummy_model = TaikoConformer7()
    with torch.no_grad():
        cnn_out = dummy_model.cnn(mel.unsqueeze(0))  # (1, C, F, T_cnn)
        _, _, _, T_cnn = cnn_out.shape

    # Pad or truncate labels to T_cnn
    def pad_or_truncate(label, out_len):
        if label.shape[0] < out_len:
            pad = torch.zeros(out_len - label.shape[0], dtype=label.dtype)
            return torch.cat([label, pad], dim=0)
        else:
            return label[:out_len]

    don_labels = pad_or_truncate(don_labels, T_cnn)
    ka_labels = pad_or_truncate(ka_labels, T_cnn)
    drumroll_labels = pad_or_truncate(drumroll_labels, T_cnn)
    sliding_nps_labels = pad_or_truncate(sliding_nps_labels, T_cnn)  # Pad new label

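    # Note: this instantiates a full TaikoConformer7 per example just to probe
    # the CNN output length. A cheaper sketch (assuming a module-level cache is
    # acceptable in this codebase) would build the probe once:
    #
    #   _SHAPE_PROBE = TaikoConformer7().cnn
    #   with torch.no_grad():
    #       T_cnn = _SHAPE_PROBE(mel.unsqueeze(0)).shape[-1]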
    # For conformer input lengths: this should be T_cnn
    conformer_sequence_length = T_cnn  # The actual sequence length after the CNN

    print(
        f"Processed {num_valid_notes} notes in {duration_seconds:.2f} seconds, NPS: {nps:.2f}, Difficulty: {difficulty_id}, Level: {level}"
    )

    return {
        "mel": mel,  # (1, N_MELS, T)
        "don_labels": don_labels,  # (T_cnn,)
        "ka_labels": ka_labels,  # (T_cnn,)
        "drumroll_labels": drumroll_labels,  # (T_cnn,)
        "sliding_nps_labels": sliding_nps_labels,  # New label (T_cnn,)
        "nps": torch.tensor(nps, dtype=torch.float32),
        "difficulty": torch.tensor(difficulty_id, dtype=torch.long),
        "level": torch.tensor(level, dtype=torch.long),
        "duration_seconds": torch.tensor(duration_seconds, dtype=torch.float32),
        "length": torch.tensor(
            conformer_sequence_length, dtype=torch.long
        ),  # T_cnn, used for the conformer and loss masking
    }


def collate_fn(batch):
    mels_list = [b["mel"].squeeze(0).transpose(0, 1) for b in batch]  # (T, N_MELS)
    don_labels_list = [b["don_labels"] for b in batch]
    ka_labels_list = [b["ka_labels"] for b in batch]
    drumroll_labels_list = [b["drumroll_labels"] for b in batch]
    sliding_nps_labels_list = [b["sliding_nps_labels"] for b in batch]  # New label list
    nps_list = [b["nps"] for b in batch]
    difficulty_list = [b["difficulty"] for b in batch]
    level_list = [b["level"] for b in batch]
    durations_list = [b["duration_seconds"] for b in batch]
    lengths_list = [b["length"] for b in batch]  # These are T_cnn_i for each example

    # Pad mels to the longest mel length in the batch
    padded_mels = nn.utils.rnn.pad_sequence(
        mels_list, batch_first=True
    )  # (B, T_max_mel, N_MELS)
    reshaped_mels = padded_mels.transpose(1, 2).unsqueeze(1)  # (B, 1, N_MELS, T_max_mel)
    # T_max_mel_batch = padded_mels.shape[1]  # Max mel length in batch, no longer used for label padding

    # Determine the max label sequence length (max T_cnn in batch); 0 for an empty batch
    max_label_len = max((l.item() for l in lengths_list), default=0)

    # Pad labels to max_label_len (max T_cnn in batch)
    def pad_label_to_max_len(label_tensor, target_len):
        current_len = label_tensor.shape[0]
        if current_len < target_len:
            padding_size = target_len - current_len
            # Ensure padding is created on the same device as the label_tensor
            padding = torch.zeros(
                padding_size, dtype=label_tensor.dtype, device=label_tensor.device
            )
            return torch.cat((label_tensor, padding), dim=0)
        elif current_len > target_len:
            # Should ideally not happen if lengths_list is correct
            return label_tensor[:target_len]
        return label_tensor

    don_labels = torch.stack(
        [pad_label_to_max_len(l, max_label_len) for l in don_labels_list]
    )
    ka_labels = torch.stack(
        [pad_label_to_max_len(l, max_label_len) for l in ka_labels_list]
    )
    drumroll_labels = torch.stack(
        [pad_label_to_max_len(l, max_label_len) for l in drumroll_labels_list]
    )
    sliding_nps_labels = torch.stack(
        [pad_label_to_max_len(l, max_label_len) for l in sliding_nps_labels_list]
    )  # Pad new labels

    actual_lengths = torch.tensor([l.item() for l in lengths_list], dtype=torch.long)

    return {
        "mel": reshaped_mels,
        "don_labels": don_labels,
        "ka_labels": ka_labels,
        "drumroll_labels": drumroll_labels,
        "sliding_nps_labels": sliding_nps_labels,  # New batched labels
        "lengths": actual_lengths,  # T_cnn_i per item, for the conformer and loss masking
        "nps": torch.stack(nps_list),
        "difficulty": torch.stack(difficulty_list),
        "level": torch.stack(level_list),
        "durations": torch.stack(durations_list),
    }
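A minimal smoke test for collate_fn (a sketch; shapes are hypothetical, assuming N_MELS = 80 and two examples with T_cnn of 5 and 8):

    import torch

    def fake_example(t_cnn):
        # All values are dummies; only shapes and dtypes matter here
        return {
            "mel": torch.randn(1, 80, t_cnn * 4),
            "don_labels": torch.zeros(t_cnn),
            "ka_labels": torch.zeros(t_cnn),
            "drumroll_labels": torch.zeros(t_cnn),
            "sliding_nps_labels": torch.zeros(t_cnn),
            "nps": torch.tensor(1.0),
            "difficulty": torch.tensor(3),
            "level": torch.tensor(8),
            "duration_seconds": torch.tensor(10.0),
            "length": torch.tensor(t_cnn),
        }

    batch = collate_fn([fake_example(5), fake_example(8)])
    print(batch["mel"].shape)         # (2, 1, 80, 32): mels padded to the longest T
    print(batch["don_labels"].shape)  # (2, 8): labels padded to the longest T_cnn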
tc7/train.py
ADDED
@@ -0,0 +1,300 @@
from accelerate.utils import set_seed

set_seed(1024)

import math
import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from datasets import concatenate_datasets
import matplotlib.pyplot as plt
import numpy as np
from .config import (
    BATCH_SIZE,
    DEVICE,
    EPOCHS,
    LR,
    GRAD_ACCUM_STEPS,
    HOP_LENGTH,
    NPS_PENALTY_WEIGHT_ALPHA,
    NPS_PENALTY_WEIGHT_BETA,
    SAMPLE_RATE,
)
from .model import TaikoConformer7
from .dataset import ds
from .preprocess import preprocess, collate_fn
from .loss import TaikoLoss
from huggingface_hub import upload_folder


def log_energy_plots_to_tensorboard(
    writer,
    tag_prefix,
    epoch,
    pred_don,
    pred_ka,
    pred_drumroll,
    true_don,
    true_ka,
    true_drumroll,
    valid_length,
    hop_sec,
):
    """
    Logs a plot of predicted vs. true energies for one sample to TensorBoard.
    Inputs are 1D per-sample tensors; only the first valid_length frames are plotted.
    """
    pred_don = pred_don[:valid_length].detach().cpu().numpy()
    pred_ka = pred_ka[:valid_length].detach().cpu().numpy()
    pred_drumroll = pred_drumroll[:valid_length].detach().cpu().numpy()
    true_don = true_don[:valid_length].cpu().numpy()
    true_ka = true_ka[:valid_length].cpu().numpy()
    true_drumroll = true_drumroll[:valid_length].cpu().numpy()

    time_axis = np.arange(valid_length) * hop_sec

    fig, axs = plt.subplots(3, 1, figsize=(15, 10), sharex=True)
    fig.suptitle(f"{tag_prefix} - Epoch {epoch}", fontsize=16)

    axs[0].plot(time_axis, true_don, label="True Don", color="blue", linestyle="--")
    axs[0].plot(time_axis, pred_don, label="Pred Don", color="lightblue", alpha=0.8)
    axs[0].set_ylabel("Don Energy")
    axs[0].legend()
    axs[0].grid(True)

    axs[1].plot(time_axis, true_ka, label="True Ka", color="red", linestyle="--")
    axs[1].plot(time_axis, pred_ka, label="Pred Ka", color="lightcoral", alpha=0.8)
    axs[1].set_ylabel("Ka Energy")
    axs[1].legend()
    axs[1].grid(True)

    axs[2].plot(
        time_axis, true_drumroll, label="True Drumroll", color="green", linestyle="--"
    )
    axs[2].plot(
        time_axis, pred_drumroll, label="Pred Drumroll", color="lightgreen", alpha=0.8
    )
    axs[2].set_ylabel("Drumroll Energy")
    axs[2].set_xlabel("Time (s)")
    axs[2].legend()
    axs[2].grid(True)

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    writer.add_figure(f"{tag_prefix}/Energy_Comparison", fig, epoch)
    plt.close(fig)


def main():
    global ds

    output_frame_hop_sec = HOP_LENGTH / SAMPLE_RATE

    best_val_loss = float("inf")
    patience = 10
    pat_count = 0

    # Preprocess each difficulty separately, then train on the concatenation
    ds_oni = ds.map(
        preprocess,
        remove_columns=ds.column_names,
        fn_kwargs={"difficulty": "oni"},
        writer_batch_size=10,
    )
    ds_hard = ds.map(
        preprocess,
        remove_columns=ds.column_names,
        fn_kwargs={"difficulty": "hard"},
        writer_batch_size=10,
    )
    ds_normal = ds.map(
        preprocess,
        remove_columns=ds.column_names,
        fn_kwargs={"difficulty": "normal"},
        writer_batch_size=10,
    )
    ds = concatenate_datasets([ds_oni, ds_hard, ds_normal])

    ds_train_test = ds.train_test_split(test_size=0.1, seed=42)
    train_loader = DataLoader(
        ds_train_test["train"],
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=8,
        persistent_workers=True,
        prefetch_factor=4,
    )
    val_loader = DataLoader(
        ds_train_test["test"],
        batch_size=BATCH_SIZE,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=8,
        persistent_workers=True,
        prefetch_factor=4,
    )

+
model = TaikoConformer7().to(DEVICE)
|
| 138 |
+
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
|
| 139 |
+
|
| 140 |
+
criterion = TaikoLoss(
|
| 141 |
+
reduction="mean",
|
| 142 |
+
nps_penalty_weight_alpha=NPS_PENALTY_WEIGHT_ALPHA,
|
| 143 |
+
nps_penalty_weight_beta=NPS_PENALTY_WEIGHT_BETA,
|
| 144 |
+
).to(DEVICE)
|
| 145 |
+
|
| 146 |
+
num_optimizer_steps_per_epoch = math.ceil(len(train_loader) / GRAD_ACCUM_STEPS)
|
| 147 |
+
total_optimizer_steps = EPOCHS * num_optimizer_steps_per_epoch
|
| 148 |
+
|
| 149 |
+
scheduler = torch.optim.lr_scheduler.OneCycleLR(
|
| 150 |
+
optimizer, max_lr=LR, total_steps=total_optimizer_steps
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
writer = SummaryWriter()
|
| 154 |
+
|
| 155 |
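    # Example (hypothetical sizes): with 1,000 batches per epoch,
    # GRAD_ACCUM_STEPS = 4 and EPOCHS = 10, the scheduler is configured for
    # ceil(1000 / 4) * 10 = 2,500 total optimizer steps.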
    for epoch in range(1, EPOCHS + 1):
        model.train()
        total_epoch_loss = 0.0
        optimizer.zero_grad()

        for idx, batch in enumerate(tqdm(train_loader, desc=f"Train Epoch {epoch}")):
            mel = batch["mel"].to(DEVICE)
            lengths = batch["lengths"].to(DEVICE)
            nps = batch["nps"].to(DEVICE)
            difficulty = batch["difficulty"].to(DEVICE)
            level = batch["level"].to(DEVICE)

            outputs = model(mel, lengths, nps, difficulty, level)
            loss = criterion(outputs, batch)

            total_epoch_loss += loss.item()

            # Scale the loss so gradients average over the accumulation window
            loss = loss / GRAD_ACCUM_STEPS
            loss.backward()

            if (idx + 1) % GRAD_ACCUM_STEPS == 0 or (idx + 1) == len(train_loader):
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            writer.add_scalar(
                "Loss/Train_Step",
                loss.item() * GRAD_ACCUM_STEPS,
                epoch * len(train_loader) + idx,
            )
            writer.add_scalar(
                "LR", scheduler.get_last_lr()[0], epoch * len(train_loader) + idx
            )

            # Plot the first sample of the first few batches each epoch
            if idx < 3 and mel.size(0) > 0:
                pred_don = outputs["presence"][0, :, 0]
                pred_ka = outputs["presence"][0, :, 1]
                pred_drumroll = outputs["presence"][0, :, 2]
                true_don = batch["don_labels"][0]
                true_ka = batch["ka_labels"][0]
                true_drumroll = batch["drumroll_labels"][0]
                valid_length = batch["lengths"][0].item()

                log_energy_plots_to_tensorboard(
                    writer,
                    f"Train_Sample_Batch_{idx}_Sample_0",
                    epoch,
                    pred_don,
                    pred_ka,
                    pred_drumroll,
                    true_don,
                    true_ka,
                    true_drumroll,
                    valid_length,
                    output_frame_hop_sec,
                )

        avg_train_loss = total_epoch_loss / len(train_loader)
        writer.add_scalar("Loss/Train_Avg", avg_train_loss, epoch)

        model.eval()
        total_val_loss = 0.0

        with torch.no_grad():
            for idx, batch in enumerate(tqdm(val_loader, desc=f"Val Epoch {epoch}")):
                mel = batch["mel"].to(DEVICE)
                lengths = batch["lengths"].to(DEVICE)
                nps = batch["nps"].to(DEVICE)
                difficulty = batch["difficulty"].to(DEVICE)
                level = batch["level"].to(DEVICE)

                outputs = model(mel, lengths, nps, difficulty, level)
                loss = criterion(outputs, batch)
                total_val_loss += loss.item()

                if idx < 3 and mel.size(0) > 0:
                    pred_don = outputs["presence"][0, :, 0]
                    pred_ka = outputs["presence"][0, :, 1]
                    pred_drumroll = outputs["presence"][0, :, 2]
                    true_don = batch["don_labels"][0]
                    true_ka = batch["ka_labels"][0]
                    true_drumroll = batch["drumroll_labels"][0]
                    valid_length = batch["lengths"][0].item()

                    log_energy_plots_to_tensorboard(
                        writer,
                        f"Val_Sample_Batch_{idx}_Sample_0",
                        epoch,
                        pred_don,
                        pred_ka,
                        pred_drumroll,
                        true_don,
                        true_ka,
                        true_drumroll,
                        valid_length,
                        output_frame_hop_sec,
                    )

        avg_val_loss = total_val_loss / len(val_loader)
        writer.add_scalar("Loss/Val_Avg", avg_val_loss, epoch)

        current_lr = optimizer.param_groups[0]["lr"]
        writer.add_scalar("LR/learning_rate", current_lr, epoch)

        # "batch" here is the last validation batch of the epoch
        if "nps" in batch:
            writer.add_scalar(
                "NPS/GT_Val_LastBatch_Avg", batch["nps"].mean().item(), epoch
            )

        print(
            f"Epoch {epoch:02d} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | LR: {current_lr:.2e}"
        )

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            pat_count = 0
            torch.save(model.state_dict(), "best_model.pt")
            print(f"Saved new best model to best_model.pt at epoch {epoch}")
        else:
            pat_count += 1
            if pat_count >= patience:
                print("Early stopping!")
                break

    writer.close()
    model_id = "JacobLinCool/taiko-conformer-7"
    try:
        model.push_to_hub(
            model_id, commit_message=f"Epoch {epoch}, Val Loss: {avg_val_loss:.4f}"
        )
        upload_folder(
            repo_id=model_id,
            folder_path="runs",
            path_in_repo="runs",
            commit_message="Upload TensorBoard logs",
        )
    except Exception as e:
        print(f"Error uploading model or logs: {e}")
        print("Make sure you have the correct permissions and try again.")

if __name__ == "__main__":
    main()
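Note: because train.py uses package-relative imports (from .config import ...), it must be run as a module from the repository root, e.g. python -m tc7.train.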