Spaces:
Sleeping
Sleeping
| """ | |
| Audio processing module for zero-shot keyword spotting. | |
| Handles audio loading, preprocessing, and feature extraction. | |
| """ | |
| import librosa | |
| import numpy as np | |
| import torch | |
| from typing import Union, Tuple | |
| import warnings | |
| warnings.filterwarnings("ignore") | |
class AudioProcessor:
    """Handles audio preprocessing for the keyword spotting model.

    The pipeline turns an arbitrary waveform into a fixed-length, mono,
    float32 ``torch.Tensor`` sampled at ``target_sample_rate``:

    1. cast to float32
    2. down-mix to mono
    3. resample to ``target_sample_rate``
    4. trim to ``max_samples``
    5. RMS-normalize and clip to ``[-1, 1]``
    6. zero-pad up to ``max_samples``
    """

    def __init__(self, target_sample_rate: int = 48000, max_duration: float = 30.0):
        """
        Initialize the audio processor.

        Args:
            target_sample_rate: Target sampling rate for audio processing
            max_duration: Maximum audio duration in seconds

        Raises:
            ValueError: If either argument is not strictly positive.
        """
        if target_sample_rate <= 0:
            raise ValueError(
                f"target_sample_rate must be positive, got {target_sample_rate}"
            )
        if max_duration <= 0:
            raise ValueError(f"max_duration must be positive, got {max_duration}")

        self.target_sample_rate = target_sample_rate
        self.max_duration = max_duration
        # Fixed output length (in samples) that every waveform is trimmed/padded to.
        self.max_samples = int(target_sample_rate * max_duration)

    def load_audio(self, audio_path: str) -> Tuple[np.ndarray, int]:
        """
        Load audio file and return waveform and sample rate.

        Args:
            audio_path: Path to the audio file

        Returns:
            Tuple of (waveform, sample_rate). The file's native sample rate is
            kept (``sr=None``); resampling happens in ``preprocess_audio``.

        Raises:
            ValueError: If the file cannot be loaded. The underlying error is
                attached as ``__cause__``.
        """
        try:
            # Use librosa for robust audio loading
            waveform, sr = librosa.load(audio_path, sr=None)
        except Exception as e:
            # Chain the cause so the real decoding/IO error stays in the traceback.
            raise ValueError(f"Error loading audio file: {str(e)}") from e
        return waveform, sr

    @staticmethod
    def _to_mono(waveform: np.ndarray) -> np.ndarray:
        """Average all channels of ``waveform`` into a single 1-D signal.

        2-D input may be ``(channels, samples)`` (librosa layout) or
        ``(samples, channels)`` (layout used by e.g. Gradio audio arrays).
        The shorter axis is taken to be the channel axis; when both axes have
        the same length the librosa layout is assumed.
        """
        if waveform.ndim <= 1:
            return waveform
        if waveform.ndim == 2:
            rows, cols = waveform.shape
            channel_axis = 1 if rows > cols else 0
            return waveform.mean(axis=channel_axis)
        # Higher-rank input: follow the librosa convention (time is the last axis).
        return waveform.mean(axis=tuple(range(waveform.ndim - 1)))

    def preprocess_audio(self, waveform: np.ndarray, sample_rate: int) -> torch.Tensor:
        """
        Preprocess audio waveform for model input.

        Args:
            waveform: Audio waveform as numpy array. May be 1-D, or 2-D in
                either ``(channels, samples)`` or ``(samples, channels)``
                layout. Any numeric dtype is accepted.
            sample_rate: Original sample rate

        Returns:
            Preprocessed 1-D float32 audio tensor of length ``max_samples``

        Raises:
            ValueError: If ``sample_rate`` is not positive or ``waveform`` is a
                0-d scalar.
        """
        if sample_rate <= 0:
            raise ValueError(f"sample_rate must be positive, got {sample_rate}")

        # Convert to float32 if needed (no copy when it already is float32).
        waveform = np.asarray(waveform, dtype=np.float32)
        if waveform.ndim == 0:
            raise ValueError("waveform must have at least one dimension")

        # Ensure mono audio *before* resampling: it fixes the time axis for the
        # resampler regardless of the input channel layout and avoids
        # resampling every channel separately.
        waveform = self._to_mono(waveform)

        # Resample if necessary (nothing to resample for an empty signal).
        if sample_rate != self.target_sample_rate and waveform.size > 0:
            waveform = librosa.resample(
                waveform,
                orig_sr=sample_rate,
                target_sr=self.target_sample_rate,
            )

        # Trim to max duration
        if len(waveform) > self.max_samples:
            waveform = waveform[: self.max_samples]

        # Normalize the actual signal before padding. Computing the RMS over the
        # zero padding would make the gain depend on the clip length and push
        # short clips into hard clipping.
        waveform = self._normalize_audio(waveform)

        # Pad with zeros up to the fixed length
        missing = self.max_samples - len(waveform)
        if missing > 0:
            waveform = np.pad(waveform, (0, missing), mode='constant', constant_values=0)

        # Convert to tensor
        return torch.from_numpy(waveform).float()

    def _normalize_audio(self, waveform: np.ndarray) -> np.ndarray:
        """
        Normalize audio waveform.

        The signal is scaled so that its RMS becomes 0.1 and then clipped to
        ``[-1, 1]``. Empty or all-zero input is returned without scaling.

        Args:
            waveform: Input waveform

        Returns:
            Normalized waveform
        """
        if waveform.size == 0:
            return waveform

        # RMS normalization
        rms = np.sqrt(np.mean(waveform**2))
        if rms > 0:
            waveform = waveform / (rms * 10)  # Scale down to prevent clipping

        # Clip to [-1, 1] range
        return np.clip(waveform, -1.0, 1.0)

    def process_audio_file(self, audio_path: str) -> torch.Tensor:
        """
        Complete audio processing pipeline from file to tensor.

        Args:
            audio_path: Path to audio file

        Returns:
            Preprocessed audio tensor ready for model input

        Raises:
            ValueError: If the file cannot be loaded.
        """
        waveform, sample_rate = self.load_audio(audio_path)
        return self.preprocess_audio(waveform, sample_rate)

    def process_audio_array(self, audio_array: np.ndarray, sample_rate: int) -> torch.Tensor:
        """
        Process audio from numpy array (e.g., from Gradio microphone input).

        Args:
            audio_array: Audio data as numpy array
            sample_rate: Sample rate of the audio

        Returns:
            Preprocessed audio tensor
        """
        return self.preprocess_audio(audio_array, sample_rate)