"""Gradio Space: side-by-side comparison of Whisper-Hebrish and stock Whisper Large V3 Turbo."""

import gradio as gr
import spaces
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Model IDs
FINETUNED_MODEL_ID = "danielrosehill/Whisper-Hebrish"
STOCK_MODEL_ID = "openai/whisper-large-v3-turbo"

# Device configuration
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load fine-tuned model
print("Loading fine-tuned model...")
finetuned_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    FINETUNED_MODEL_ID,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
finetuned_model.to(device)
finetuned_processor = AutoProcessor.from_pretrained(FINETUNED_MODEL_ID)
finetuned_pipe = pipeline(
    "automatic-speech-recognition",
    model=finetuned_model,
    tokenizer=finetuned_processor.tokenizer,
    feature_extractor=finetuned_processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

# Load stock model
print("Loading stock model...")
stock_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    STOCK_MODEL_ID,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
stock_model.to(device)
stock_processor = AutoProcessor.from_pretrained(STOCK_MODEL_ID)
stock_pipe = pipeline(
    "automatic-speech-recognition",
    model=stock_model,
    tokenizer=stock_processor.tokenizer,
    feature_extractor=stock_processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)


@spaces.GPU
def transcribe_both(audio):
    """Transcribe audio using both the fine-tuned and stock models."""
    if audio is None:
        return "Please record or upload audio.", "Please record or upload audio."

    # Transcribe with fine-tuned model
    finetuned_result = finetuned_pipe(audio)
    finetuned_text = finetuned_result["text"]

    # Transcribe with stock model
    stock_result = stock_pipe(audio)
    stock_text = stock_result["text"]

    return finetuned_text, stock_text


# Custom CSS for styling
custom_css = """
#finetuned-output {
    border: 3px solid #10b981 !important;
    border-radius: 8px !important;
    padding: 16px !important;
}
#stock-output {
    border: 3px solid #3b82f6 !important;
    border-radius: 8px !important;
    padding: 16px !important;
}
.model-label {
    font-size: 24px !important;
    font-weight: bold !important;
    margin-bottom: 12px !important;
}
#finetuned-label {
    color: #10b981 !important;
}
#stock-label {
    color: #3b82f6 !important;
}
.gradio-container {
    max-width: 1200px !important;
}
"""

# Create Gradio interface
with gr.Blocks(css=custom_css, title="Whisper Hebrish: Side By Side Demo") as demo:
    gr.Markdown("# Whisper Hebrish: Side By Side Demo")

    with gr.Tabs():
        with gr.Tab("Compare Models"):
            gr.Markdown(
                """
                Compare the performance of **Whisper-Hebrish** (fine-tuned for mixed-language speech)
                with the stock **Whisper Large V3 Turbo** model.

                **Record or upload audio** to see side-by-side transcription results.
                """
            )

            with gr.Row():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Record or Upload Audio",
                )

            transcribe_btn = gr.Button(
                "🚀 Transcribe with Both Models", variant="primary", size="lg"
            )

            gr.Markdown("---")

            # The source file is cut off below this point; the output panels and
            # event wiring that follow are a reconstructed sketch, inferred from
            # the selectors defined in custom_css and the two return values of
            # transcribe_both.
            with gr.Row():
                with gr.Column():
                    gr.Markdown(
                        '<div class="model-label" id="finetuned-label">Whisper-Hebrish (Fine-Tuned)</div>'
                    )
                    finetuned_output = gr.Textbox(
                        label="Transcription",
                        elem_id="finetuned-output",
                        lines=6,
                    )
                with gr.Column():
                    gr.Markdown(
                        '<div class="model-label" id="stock-label">Whisper Large V3 Turbo (Stock)</div>'
                    )
                    stock_output = gr.Textbox(
                        label="Transcription",
                        elem_id="stock-output",
                        lines=6,
                    )

            transcribe_btn.click(
                fn=transcribe_both,
                inputs=audio_input,
                outputs=[finetuned_output, stock_output],
            )

demo.launch()
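
# Usage note (assumption: this file is deployed as the Space's app.py):
#   - Locally: run `python app.py` and open the printed local URL.
#   - On Hugging Face Spaces (ZeroGPU), the @spaces.GPU decorator requests a GPU
#     only for the duration of each transcribe_both call; both models stay
#     loaded in the process between calls.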