"""Whisper-Hebrish / app.py

Gradio app comparing the fine-tuned Whisper-Hebrish model against the stock
Whisper Large V3 Turbo model, side by side, on the same audio input.
"""
import gradio as gr
import spaces
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
# Model IDs
FINETUNED_MODEL_ID = "danielrosehill/Whisper-Hebrish"
STOCK_MODEL_ID = "openai/whisper-large-v3-turbo"
# Device configuration
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
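# (On CUDA, float16 roughly halves model memory and speeds up inference;
# CPU inference stays in float32, the safer default there.)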
# Load fine-tuned model
print("Loading fine-tuned model...")
finetuned_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    FINETUNED_MODEL_ID,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
finetuned_model.to(device)
finetuned_processor = AutoProcessor.from_pretrained(FINETUNED_MODEL_ID)
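# The processor bundles the tokenizer and feature extractor saved with the
# fine-tuned checkpoint, so decoding matches the model's training setup.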
finetuned_pipe = pipeline(
    "automatic-speech-recognition",
    model=finetuned_model,
    tokenizer=finetuned_processor.tokenizer,
    feature_extractor=finetuned_processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)
# Load stock model
print("Loading stock model...")
stock_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    STOCK_MODEL_ID,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
stock_model.to(device)
stock_processor = AutoProcessor.from_pretrained(STOCK_MODEL_ID)
stock_pipe = pipeline(
    "automatic-speech-recognition",
    model=stock_model,
    tokenizer=stock_processor.tokenizer,
    feature_extractor=stock_processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)
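# On Hugging Face ZeroGPU Spaces, @spaces.GPU attaches a GPU for the duration
# of each decorated call instead of holding one for the whole session.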
@spaces.GPU
def transcribe_both(audio):
"""Transcribe audio using both fine-tuned and stock models."""
if audio is None:
return "Please record or upload audio.", "Please record or upload audio."
# Transcribe with fine-tuned model
finetuned_result = finetuned_pipe(audio)
finetuned_text = finetuned_result["text"]
# Transcribe with stock model
stock_result = stock_pipe(audio)
stock_text = stock_result["text"]
return finetuned_text, stock_text
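
# A hedged sketch (not wired into the UI): for long recordings, the ASR pipeline
# can chunk audio internally. chunk_length_s=30 is an illustrative value, and on
# ZeroGPU this variant would also need the @spaces.GPU decorator.
def transcribe_both_chunked(audio, chunk_length_s=30):
    """Variant of transcribe_both that enables pipeline-internal chunking."""
    if audio is None:
        return "Please record or upload audio.", "Please record or upload audio."
    finetuned_text = finetuned_pipe(audio, chunk_length_s=chunk_length_s)["text"]
    stock_text = stock_pipe(audio, chunk_length_s=chunk_length_s)["text"]
    return finetuned_text, stock_text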
# Custom CSS for styling
custom_css = """
#finetuned-output {
border: 3px solid #10b981 !important;
border-radius: 8px !important;
padding: 16px !important;
}
#stock-output {
border: 3px solid #3b82f6 !important;
border-radius: 8px !important;
padding: 16px !important;
}
.model-label {
font-size: 24px !important;
font-weight: bold !important;
margin-bottom: 12px !important;
}
#finetuned-label {
color: #10b981 !important;
}
#stock-label {
color: #3b82f6 !important;
}
.gradio-container {
max-width: 1200px !important;
}
"""
# Create Gradio interface
with gr.Blocks(css=custom_css, title="Whisper Hebrish: Side By Side Demo") as demo:
gr.Markdown("# Whisper Hebrish: Side By Side Demo")
with gr.Tabs():
with gr.Tab("Compare Models"):
gr.Markdown(
"""
Compare the performance of **Whisper-Hebrish** (fine-tuned for mixed language speech)
with the stock **Whisper Large V3 Turbo** model.
**Record or upload audio** to see side-by-side transcription results.
"""
)
with gr.Row():
audio_input = gr.Audio(
sources=["microphone", "upload"],
type="filepath",
label="Record or Upload Audio"
)
transcribe_btn = gr.Button("πŸš€ Transcribe with Both Models", variant="primary", size="lg")
gr.Markdown("---")
with gr.Row():
with gr.Column():
gr.Markdown(
'<div class="model-label" id="finetuned-label">✨ WHISPER-HEBRISH (Fine-Tuned)</div>'
)
finetuned_output = gr.Textbox(
label="Transcription",
placeholder="Fine-tuned model output will appear here...",
lines=8,
elem_id="finetuned-output"
)
with gr.Column():
gr.Markdown(
'<div class="model-label" id="stock-label">πŸ“¦ WHISPER LARGE V3 TURBO (Stock)</div>'
)
stock_output = gr.Textbox(
label="Transcription",
placeholder="Stock model output will appear here...",
lines=8,
elem_id="stock-output"
)
# Connect the button to the transcription function
transcribe_btn.click(
fn=transcribe_both,
inputs=[audio_input],
outputs=[finetuned_output, stock_output]
)
with gr.Tab("About"):
gr.Markdown(
"""
## About This Demo
This Space provides a **side-by-side comparison** of automatic speech recognition between:
- **Whisper-Hebrish**: A fine-tuned version optimized for mixed Hebrew-English speech patterns
- **Whisper Large V3 Turbo**: OpenAI's stock model for general-purpose transcription
"""
)
            # Screenshots
            with gr.Row():
                with gr.Column():
                    gr.Image("screenshots/1.png", label="Demo Interface", show_label=True)
                with gr.Column():
                    gr.Image("screenshots/2.png", label="Comparison Results", show_label=True)
            gr.Markdown(
                """
                ## How to Use
                1. **Record** audio using your microphone or **upload** an audio file
                2. Click **"Transcribe with Both Models"**
                3. Compare the results side by side

                ## About Whisper-Hebrish
                Whisper-Hebrish is a fine-tuned version of OpenAI's Whisper model, optimized for the
                mixed-language speech patterns common in Hebrew-English conversations, including
                mid-sentence switching between the two languages.

                ### Model Information
                πŸ€— **Model:** [danielrosehill/Whisper-Hebrish](https://huggingface.co/danielrosehill/Whisper-Hebrish)

                The fine-tuned model has been trained to better handle:
                - Mixed Hebrew-English speech
                - Code-switching between Hebrew and English
                - Hebrew-specific phonetic patterns
                - Common transliteration patterns
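
                A minimal, hedged usage sketch for loading the model directly with
                `transformers` (the audio filename is a placeholder):

                ```python
                from transformers import pipeline

                asr = pipeline("automatic-speech-recognition", model="danielrosehill/Whisper-Hebrish")
                print(asr("your_audio.wav")["text"])
                ```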
                ## Technical Details
                - Inference powered by Hugging Face ZeroGPU
                - Both models transcribe the same audio back-to-back for a direct comparison
                - Supports both microphone recording and file upload
                - Based on the Whisper Large V3 architecture

                ## Use Cases
                This comparison tool is useful for:
                - Evaluating model performance on bilingual content
                - Testing speech recognition accuracy in mixed-language scenarios
                - Understanding the benefits of fine-tuning for specific language patterns
                - Benchmarking ASR quality on mixed Hebrew-English speech

                ---
                *Compare accuracy, language-mixing handling, and overall transcription quality between the fine-tuned and stock models.*
                """
            )
if __name__ == "__main__":
    demo.launch()