Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import spaces | |
| from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
| import torch | |
# Hugging Face Hub repository IDs of the two checkpoints being compared.
FINETUNED_MODEL_ID = "danielrosehill/Whisper-Hebrish"
STOCK_MODEL_ID = "openai/whisper-large-v3-turbo"

# Execution device and matching precision: half precision on the first CUDA
# device when one is available, otherwise full precision on the CPU.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32
def _load_asr_pipeline(model_id):
    """Load one checkpoint and wrap it in a speech-recognition pipeline.

    Both models in the comparison go through this single code path so they
    are always loaded with the same options (dtype, device, safetensors).

    Args:
        model_id: Hugging Face Hub repository ID of the checkpoint to load.

    Returns:
        A ``(model, processor, pipe)`` tuple: the model (already moved to the
        module-level ``device``), its processor, and the
        ``"automatic-speech-recognition"`` pipeline built from the two.
    """
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)

    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
    )
    return model, processor, asr_pipe


# Load fine-tuned model
print("Loading fine-tuned model...")
finetuned_model, finetuned_processor, finetuned_pipe = _load_asr_pipeline(
    FINETUNED_MODEL_ID
)

# Load stock model
print("Loading stock model...")
stock_model, stock_processor, stock_pipe = _load_asr_pipeline(STOCK_MODEL_ID)
# On a ZeroGPU Space a GPU is only attached while a function wrapped with
# ``spaces.GPU`` is running, so the inference entry point must be decorated.
# If ``spaces`` cannot be imported, fall back to a pass-through decorator.
try:
    from spaces import GPU as _zero_gpu
except ImportError:
    def _zero_gpu(fn):
        """Return ``fn`` unchanged (stand-in for ``spaces.GPU``)."""
        return fn


@_zero_gpu
def transcribe_both(audio):
    """Transcribe audio using both the fine-tuned and the stock model.

    Args:
        audio: Value of the audio input component (declared with
            ``type="filepath"``), or ``None`` when nothing was recorded
            or uploaded.

    Returns:
        A ``(finetuned_text, stock_text)`` tuple of transcriptions. When
        ``audio`` is ``None`` both entries are a prompt asking for audio.
    """
    if audio is None:
        prompt = "Please record or upload audio."
        return prompt, prompt

    # Run the same input through each pipeline and keep only the text.
    finetuned_text = finetuned_pipe(audio)["text"]
    stock_text = stock_pipe(audio)["text"]
    return finetuned_text, stock_text
# Custom CSS for styling, passed to gr.Blocks(css=...).
# - #finetuned-output / #stock-output: bordered boxes for the two result
#   textboxes (green #10b981 for the fine-tuned model, blue #3b82f6 for stock);
#   the ids match the elem_id values given to the Textbox components.
# - .model-label, #finetuned-label, #stock-label: the large coloured headings
#   rendered above each textbox via raw HTML <div>s.
# - .gradio-container: caps the overall page width.
custom_css = """
#finetuned-output {
border: 3px solid #10b981 !important;
border-radius: 8px !important;
padding: 16px !important;
}
#stock-output {
border: 3px solid #3b82f6 !important;
border-radius: 8px !important;
padding: 16px !important;
}
.model-label {
font-size: 24px !important;
font-weight: bold !important;
margin-bottom: 12px !important;
}
#finetuned-label {
color: #10b981 !important;
}
#stock-label {
color: #3b82f6 !important;
}
.gradio-container {
max-width: 1200px !important;
}
"""
# Create Gradio interface: a two-tab app ("Compare Models" and "About").
#
# NOTE(review): the emoji prefixes in the labels below had been mis-encoded
# (UTF-8 bytes decoded with a Greek code page). The sparkles, package and
# hugging-face emoji were recovered from the surviving bytes; the rocket on
# the button is a best guess because its distinguishing bytes were lost --
# confirm against the upstream file.
with gr.Blocks(css=custom_css, title="Whisper Hebrish: Side By Side Demo") as demo:
    gr.Markdown("# Whisper Hebrish: Side By Side Demo")

    with gr.Tabs():
        # Tab 1: record or upload audio and view both transcriptions.
        with gr.Tab("Compare Models"):
            gr.Markdown(
                """
                Compare the performance of **Whisper-Hebrish** (fine-tuned for mixed language speech)
                with the stock **Whisper Large V3 Turbo** model.
                **Record or upload audio** to see side-by-side transcription results.
                """
            )

            with gr.Row():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Record or Upload Audio"
                )

            transcribe_btn = gr.Button("🚀 Transcribe with Both Models", variant="primary", size="lg")

            gr.Markdown("---")

            # One column per model; the elem_id / id values are the hooks
            # targeted by the selectors in custom_css.
            with gr.Row():
                with gr.Column():
                    gr.Markdown(
                        '<div class="model-label" id="finetuned-label">✨ WHISPER-HEBRISH (Fine-Tuned)</div>'
                    )
                    finetuned_output = gr.Textbox(
                        label="Transcription",
                        placeholder="Fine-tuned model output will appear here...",
                        lines=8,
                        elem_id="finetuned-output"
                    )

                with gr.Column():
                    gr.Markdown(
                        '<div class="model-label" id="stock-label">📦 WHISPER LARGE V3 TURBO (Stock)</div>'
                    )
                    stock_output = gr.Textbox(
                        label="Transcription",
                        placeholder="Stock model output will appear here...",
                        lines=8,
                        elem_id="stock-output"
                    )

            # Connect the button to the transcription function; its two
            # return values fill the two textboxes in order.
            transcribe_btn.click(
                fn=transcribe_both,
                inputs=[audio_input],
                outputs=[finetuned_output, stock_output]
            )

        # Tab 2: static description, screenshots and usage notes.
        with gr.Tab("About"):
            gr.Markdown(
                """
                ## About This Demo
                This Space provides a **side-by-side comparison** of automatic speech recognition between:
                - **Whisper-Hebrish**: A fine-tuned version optimized for mixed Hebrew-English speech patterns
                - **Whisper Large V3 Turbo**: OpenAI's stock model for general-purpose transcription
                """
            )

            # Screenshots
            with gr.Row():
                with gr.Column():
                    gr.Image("screenshots/1.png", label="Demo Interface", show_label=True)
                with gr.Column():
                    gr.Image("screenshots/2.png", label="Comparison Results", show_label=True)

            gr.Markdown(
                """
                ## How to Use
                1. **Record** audio using your microphone or **upload** an audio file
                2. Click **"Transcribe with Both Models"**
                3. Compare the results side-by-side
                ## About Whisper-Hebrish
                Whisper-Hebrish is a fine-tuned version of OpenAI's Whisper model, specifically optimized for handling mixed language speech patterns common in Hebrew-English conversations and switching between languages.
                ### Model Information
                🤗 **Model:** [danielrosehill/Whisper-Hebrish](https://huggingface.co/danielrosehill/Whisper-Hebrish)
                The fine-tuned model has been trained to better handle:
                - Mixed Hebrew-English speech
                - Switching between Hebrew and English
                - Hebrew-specific phonetic patterns
                - Common transliteration patterns
                ## Technical Details
                - Inference powered by Zero GPU
                - Both models run simultaneously for direct comparison
                - Supports both microphone recording and file upload
                - Based on Whisper Large V3 architecture
                ## Use Cases
                This comparison tool is useful for:
                - Evaluating model performance on bilingual content
                - Testing speech recognition accuracy for mixed-language scenarios
                - Understanding the benefits of fine-tuning for specific language patterns
                - Benchmarking ASR quality for mixed Hebrew-English speech
                ---
                *Compare accuracy, handling of language mixing, and transcription quality between the fine-tuned and stock models.*
                """
            )
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()