import gradio as gr
import spaces
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch

# Model IDs
FINETUNED_MODEL_ID = "danielrosehill/Whisper-Hebrish"
STOCK_MODEL_ID = "openai/whisper-large-v3-turbo"

# Device configuration
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load fine-tuned model
print("Loading fine-tuned model...")
finetuned_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    FINETUNED_MODEL_ID,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
finetuned_model.to(device)
finetuned_processor = AutoProcessor.from_pretrained(FINETUNED_MODEL_ID)
finetuned_pipe = pipeline(
    "automatic-speech-recognition",
    model=finetuned_model,
    tokenizer=finetuned_processor.tokenizer,
    feature_extractor=finetuned_processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

# Load stock model
print("Loading stock model...")
stock_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    STOCK_MODEL_ID,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
stock_model.to(device)
stock_processor = AutoProcessor.from_pretrained(STOCK_MODEL_ID)
stock_pipe = pipeline(
    "automatic-speech-recognition",
    model=stock_model,
    tokenizer=stock_processor.tokenizer,
    feature_extractor=stock_processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)


@spaces.GPU
def transcribe_both(audio):
    """Transcribe audio using both fine-tuned and stock models."""
    if audio is None:
        return "Please record or upload audio.", "Please record or upload audio."

    # Transcribe with fine-tuned model
    finetuned_result = finetuned_pipe(audio)
    finetuned_text = finetuned_result["text"]

    # Transcribe with stock model
    stock_result = stock_pipe(audio)
    stock_text = stock_result["text"]

    return finetuned_text, stock_text
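

# A hedged variant, not wired into the UI below (the function name and the
# "hebrew" language tag are illustrative choices, not part of the original
# app): Whisper normally auto-detects the language, which can flip between
# windows on heavily code-switched audio. transformers' ASR pipeline forwards
# generate_kwargs to model.generate(), so decoding can instead be pinned to
# Hebrew. Whether pinning helps Hebrish speech is something to verify
# empirically against the default auto-detection.
@spaces.GPU
def transcribe_both_pinned(audio, language="hebrew"):
    """Like transcribe_both, but forces both models to decode as Hebrew."""
    if audio is None:
        return "Please record or upload audio.", "Please record or upload audio."
    generate_kwargs = {"language": language, "task": "transcribe"}
    finetuned_text = finetuned_pipe(audio, generate_kwargs=generate_kwargs)["text"]
    stock_text = stock_pipe(audio, generate_kwargs=generate_kwargs)["text"]
    return finetuned_text, stock_text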

# Custom CSS for styling
custom_css = """
#finetuned-output {
    border: 3px solid #10b981 !important;
    border-radius: 8px !important;
    padding: 16px !important;
}

#stock-output {
    border: 3px solid #3b82f6 !important;
    border-radius: 8px !important;
    padding: 16px !important;
}

.model-label {
    font-size: 24px !important;
    font-weight: bold !important;
    margin-bottom: 12px !important;
}

#finetuned-label {
    color: #10b981 !important;
}

#stock-label {
    color: #3b82f6 !important;
}

.gradio-container {
    max-width: 1200px !important;
}
"""

# Create Gradio interface
with gr.Blocks(css=custom_css, title="Whisper Hebrish: Side By Side Demo") as demo:
    gr.Markdown("# Whisper Hebrish: Side By Side Demo")

    with gr.Tabs():
        with gr.Tab("Compare Models"):
            gr.Markdown(
                """
                Compare the performance of **Whisper-Hebrish** (fine-tuned for mixed language speech)
                with the stock **Whisper Large V3 Turbo** model.

                **Record or upload audio** to see side-by-side transcription results.
                """
            )

            with gr.Row():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Record or Upload Audio"
                )

            transcribe_btn = gr.Button("🚀 Transcribe with Both Models", variant="primary", size="lg")

            gr.Markdown("---")

            with gr.Row():
                with gr.Column():
                    gr.Markdown(
                        '<div class="model-label" id="finetuned-label">✨ WHISPER-HEBRISH (Fine-Tuned)</div>'
                    )
                    finetuned_output = gr.Textbox(
                        label="Transcription",
                        placeholder="Fine-tuned model output will appear here...",
                        lines=8,
                        elem_id="finetuned-output"
                    )
                with gr.Column():
                    gr.Markdown(
                        '<div class="model-label" id="stock-label">📦 WHISPER LARGE V3 TURBO (Stock)</div>'
                    )
                    stock_output = gr.Textbox(
                        label="Transcription",
                        placeholder="Stock model output will appear here...",
                        lines=8,
                        elem_id="stock-output"
                    )

            # Connect the button to the transcription function
            transcribe_btn.click(
                fn=transcribe_both,
                inputs=[audio_input],
                outputs=[finetuned_output, stock_output]
            )
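
            # A hedged convenience (assumes the Gradio 4.x Audio event API;
            # this wiring is an illustrative addition): also run the
            # comparison as soon as a microphone recording stops, so the
            # button click is optional for recorded input.
            audio_input.stop_recording(
                fn=transcribe_both,
                inputs=[audio_input],
                outputs=[finetuned_output, stock_output]
            )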

        with gr.Tab("About"):
            gr.Markdown(
                """
                ## About This Demo

                This Space provides a **side-by-side comparison** of automatic speech recognition between:

                - **Whisper-Hebrish**: A fine-tuned version optimized for mixed Hebrew-English speech patterns
                - **Whisper Large V3 Turbo**: OpenAI's stock model for general-purpose transcription
                """
            )

            # Screenshots
            with gr.Row():
                with gr.Column():
                    gr.Image("screenshots/1.png", label="Demo Interface", show_label=True)
                with gr.Column():
                    gr.Image("screenshots/2.png", label="Comparison Results", show_label=True)

            gr.Markdown(
                """
                ## How to Use

                1. **Record** audio using your microphone or **upload** an audio file
                2. Click **"Transcribe with Both Models"**
                3. Compare the results side-by-side

                ## About Whisper-Hebrish

                Whisper-Hebrish is a fine-tuned version of OpenAI's Whisper model, optimized for the
                mixed Hebrew-English speech patterns common in bilingual conversations, including
                mid-sentence code-switching between the two languages.

                ### Model Information

                🤗 **Model:** [danielrosehill/Whisper-Hebrish](https://huggingface.co/danielrosehill/Whisper-Hebrish)

                The fine-tuned model has been trained to better handle:
                - Mixed Hebrew-English speech
                - Code-switching between Hebrew and English
                - Hebrew-specific phonetic patterns
                - Common transliteration patterns

                ## Technical Details

                - Inference powered by ZeroGPU
                - Both models transcribe the same input back to back for a direct comparison
                - Supports both microphone recording and file upload
                - Based on the Whisper Large V3 architecture

                ## Use Cases

                This comparison tool is useful for:
                - Evaluating model performance on bilingual content
                - Testing speech recognition accuracy for mixed-language scenarios
                - Understanding the benefits of fine-tuning for specific language patterns
                - Benchmarking ASR quality for mixed Hebrew-English speech

                ---

                *Compare accuracy, handling of language mixing, and transcription quality between the
                fine-tuned and stock models.*
                """
            )

if __name__ == "__main__":
    demo.launch()