"""
Audio processing module for zero-shot keyword spotting.
Handles audio loading, preprocessing, and feature extraction.
"""

import librosa
import numpy as np
import torch
from typing import Union, Tuple
import warnings

# Install a blanket "ignore" filter at import time: from here on, warnings of
# every category, raised by any module in this process, are suppressed.
# NOTE(review): presumably meant to quiet noisy audio-library warnings — confirm;
# it also hides deprecation and numerical warnings from unrelated code.
warnings.filterwarnings("ignore")


class AudioProcessor:
    """Handles audio preprocessing for the keyword spotting model.

    Turns a file path or an in-memory waveform into a fixed-length, mono,
    float32 ``torch.Tensor`` sampled at ``target_sample_rate``.
    """

    # Extra divisor applied on top of the RMS so that typical signal peaks
    # stay inside [-1, 1] after normalization.
    _RMS_HEADROOM = 10.0

    def __init__(self, target_sample_rate: int = 48000, max_duration: float = 30.0):
        """
        Initialize the audio processor.

        Args:
            target_sample_rate: Target sampling rate for audio processing
            max_duration: Maximum audio duration in seconds

        Raises:
            ValueError: If either argument is not strictly positive
        """
        if target_sample_rate <= 0:
            raise ValueError(
                f"target_sample_rate must be positive, got {target_sample_rate}"
            )
        if max_duration <= 0:
            raise ValueError(f"max_duration must be positive, got {max_duration}")

        self.target_sample_rate = target_sample_rate
        self.max_duration = max_duration
        # Every processed tensor has exactly this many samples.
        self.max_samples = int(target_sample_rate * max_duration)

    def load_audio(self, audio_path: str) -> Tuple[np.ndarray, int]:
        """
        Load audio file and return waveform and sample rate.

        Args:
            audio_path: Path to the audio file

        Returns:
            Tuple of (waveform, sample_rate)

        Raises:
            ValueError: If the file cannot be read or decoded; the underlying
                exception is attached as ``__cause__``
        """
        try:
            # sr=None keeps the file's native rate; resampling happens later
            # in preprocess_audio.
            waveform, sr = librosa.load(audio_path, sr=None)
        except Exception as e:
            raise ValueError(f"Error loading audio file: {e}") from e
        return waveform, sr

    def preprocess_audio(self, waveform: np.ndarray, sample_rate: int) -> torch.Tensor:
        """
        Preprocess audio waveform for model input.

        Steps: cast to float32, downmix to mono, resample to the target rate,
        trim to ``max_samples``, RMS-normalize, then zero-pad to ``max_samples``.

        Args:
            waveform: Audio waveform as numpy array. May be 1-D, or 2-D in
                either (channels, samples) or (samples, channels) layout.
            sample_rate: Original sample rate

        Returns:
            Preprocessed 1-D float32 audio tensor of length ``max_samples``

        Raises:
            ValueError: If ``sample_rate`` is not strictly positive
        """
        if sample_rate <= 0:
            raise ValueError(f"sample_rate must be positive, got {sample_rate}")

        waveform = np.atleast_1d(np.asarray(waveform, dtype=np.float32))

        # Downmix first: resampling is linear, so the result is unchanged for
        # (channels, samples) input, while (samples, channels) input is no
        # longer resampled along the wrong axis.
        waveform = self._to_mono(waveform)

        if sample_rate != self.target_sample_rate and waveform.size > 0:
            waveform = librosa.resample(
                waveform,
                orig_sr=sample_rate,
                target_sr=self.target_sample_rate,
            )

        # Slicing is a no-op when the clip is already short enough.
        waveform = waveform[: self.max_samples]

        # Normalize BEFORE padding. Padding zeros would lower the measured RMS,
        # over-amplify short clips and push them into hard clipping.
        waveform = self._normalize_audio(waveform)

        missing = self.max_samples - len(waveform)
        if missing > 0:
            waveform = np.pad(waveform, (0, missing), mode="constant", constant_values=0)

        return torch.from_numpy(np.ascontiguousarray(waveform, dtype=np.float32)).float()

    @staticmethod
    def _to_mono(waveform: np.ndarray) -> np.ndarray:
        """
        Collapse a multi-channel waveform to a single channel by averaging.

        Args:
            waveform: Array with at least one dimension

        Returns:
            1-D waveform
        """
        if waveform.ndim == 1:
            return waveform
        if waveform.size == 0:
            return np.zeros(0, dtype=waveform.dtype)
        # A 2-D array with more rows than columns is taken to be
        # (samples, channels): real audio has far more samples than channels.
        if waveform.ndim == 2 and waveform.shape[0] > waveform.shape[1]:
            return waveform.mean(axis=1)
        # Otherwise the last axis is time; average over every leading axis.
        return waveform.mean(axis=tuple(range(waveform.ndim - 1)))

    def _normalize_audio(self, waveform: np.ndarray) -> np.ndarray:
        """
        Normalize audio waveform.

        Scales the signal so its RMS becomes ``1 / _RMS_HEADROOM`` and clips
        the result to [-1, 1]. Silent or empty input is returned without
        scaling.

        Args:
            waveform: Input waveform

        Returns:
            Normalized float32 waveform
        """
        waveform = np.asarray(waveform, dtype=np.float32)
        if waveform.size == 0:
            # Nothing to measure; avoids a mean over an empty array (NaN).
            return waveform

        rms = float(np.sqrt(np.mean(np.square(waveform))))
        if rms > 0:
            waveform = waveform / (rms * self._RMS_HEADROOM)

        # Peaks that still exceed full scale after the headroom are limited.
        return np.clip(waveform, -1.0, 1.0)

    def process_audio_file(self, audio_path: str) -> torch.Tensor:
        """
        Complete audio processing pipeline from file to tensor.

        Args:
            audio_path: Path to audio file

        Returns:
            Preprocessed audio tensor ready for model input

        Raises:
            ValueError: If the file cannot be loaded
        """
        waveform, sample_rate = self.load_audio(audio_path)
        return self.preprocess_audio(waveform, sample_rate)

    def process_audio_array(self, audio_array: np.ndarray, sample_rate: int) -> torch.Tensor:
        """
        Process audio from numpy array (e.g., from Gradio microphone input).

        Args:
            audio_array: Audio data as numpy array; 1-D, (channels, samples)
                or (samples, channels), of any numeric dtype
            sample_rate: Sample rate of the audio

        Returns:
            Preprocessed audio tensor
        """
        return self.preprocess_audio(audio_array, sample_rate)