Spaces:
Sleeping
Sleeping
File size: 7,401 Bytes
edec2f0 8701643 edec2f0 6349797 edec2f0 6349797 edec2f0 6349797 edec2f0 6349797 edec2f0 6349797 edec2f0 6349797 edec2f0 6349797 edec2f0 6349797 edec2f0 6349797 edec2f0 6349797 edec2f0 8701643 edec2f0 6349797 8701643 6349797 8701643 6349797 8701643 6349797 edec2f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 |
import gradio as gr
import spaces
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
# Identifiers of the two checkpoints that the demo compares
FINETUNED_MODEL_ID = "danielrosehill/Whisper-Hebrish"
STOCK_MODEL_ID = "openai/whisper-large-v3-turbo"

# Pick the first CUDA device with half precision when CUDA is reported as
# available; otherwise fall back to the CPU with full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32
# Whisper works on 30-second windows. Asking the pipeline to chunk its input
# lets recordings longer than that be transcribed in full instead of being
# rejected or cut short.
_CHUNK_LENGTH_S = 30


def _load_asr_pipeline(model_id):
    """Load a speech-to-text checkpoint and wrap it in an ASR pipeline.

    Both checkpoints are loaded with exactly the same settings so that the
    side-by-side comparison differs only in the weights.

    Args:
        model_id: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, processor, pipe)`` tuple: the model moved to ``device``,
        its processor, and the ``automatic-speech-recognition`` pipeline
        built from the two.
    """
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)
    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        chunk_length_s=_CHUNK_LENGTH_S,
        torch_dtype=torch_dtype,
        device=device,
    )
    return model, processor, asr_pipe


# Load fine-tuned model
print("Loading fine-tuned model...")
finetuned_model, finetuned_processor, finetuned_pipe = _load_asr_pipeline(
    FINETUNED_MODEL_ID
)

# Load stock model
print("Loading stock model...")
stock_model, stock_processor, stock_pipe = _load_asr_pipeline(STOCK_MODEL_ID)
@spaces.GPU
def transcribe_both(audio):
    """Run the same audio through both pipelines and return both transcripts.

    Args:
        audio: Input handed unchanged to each pipeline; ``None`` when nothing
            has been recorded or uploaded.

    Returns:
        A ``(finetuned_text, stock_text)`` tuple. When ``audio`` is ``None``
        both items are the same prompt asking the user to provide audio.
    """
    if audio is None:
        prompt = "Please record or upload audio."
        return prompt, prompt

    # Fine-tuned model first, then the stock model, on the identical input.
    finetuned_text = finetuned_pipe(audio)["text"]
    stock_text = stock_pipe(audio)["text"]
    return finetuned_text, stock_text
# Custom CSS for styling
# Stylesheet handed to gr.Blocks(css=...) below. Selector overview:
#   #finetuned-output / #finetuned-label - green (#10b981) frame and heading
#       for the fine-tuned model's column
#   #stock-output / #stock-label - blue (#3b82f6) frame and heading for the
#       stock model's column
#   .model-label - shared large, bold heading style for both columns
#   .gradio-container - caps the overall page width at 1200px
# The ids correspond to the elem_id values and <div id="..."> attributes used
# when the interface is built.
custom_css = """
#finetuned-output {
border: 3px solid #10b981 !important;
border-radius: 8px !important;
padding: 16px !important;
}
#stock-output {
border: 3px solid #3b82f6 !important;
border-radius: 8px !important;
padding: 16px !important;
}
.model-label {
font-size: 24px !important;
font-weight: bold !important;
margin-bottom: 12px !important;
}
#finetuned-label {
color: #10b981 !important;
}
#stock-label {
color: #3b82f6 !important;
}
.gradio-container {
max-width: 1200px !important;
}
"""
# Create Gradio interface
# Two tabs: "Compare Models" (audio input, one button, two output boxes) and
# "About" (static description and screenshots).
# NOTE(review): the block nesting below was reconstructed from a flattened
# copy of this file; in particular the button is assumed to sit below the
# audio row rather than inside it - confirm against the intended layout.
with gr.Blocks(css=custom_css, title="Whisper Hebrish: Side By Side Demo") as demo:
    gr.Markdown("# Whisper Hebrish: Side By Side Demo")

    with gr.Tabs():
        with gr.Tab("Compare Models"):
            gr.Markdown(
                """
                Compare the performance of **Whisper-Hebrish** (fine-tuned for mixed language speech)
                with the stock **Whisper Large V3 Turbo** model.
                **Record or upload audio** to see side-by-side transcription results.
                """
            )

            with gr.Row():
                # type="filepath": the callback receives a path on disk,
                # or None when no audio has been provided.
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Record or Upload Audio",
                )

            # NOTE(review): the icon in this label was mis-encoded beyond
            # recovery; the rocket is a best guess - confirm.
            transcribe_btn = gr.Button(
                "🚀 Transcribe with Both Models", variant="primary", size="lg"
            )

            gr.Markdown("---")

            with gr.Row():
                with gr.Column():
                    gr.Markdown(
                        '<div class="model-label" id="finetuned-label">✨ WHISPER-HEBRISH (Fine-Tuned)</div>'
                    )
                    finetuned_output = gr.Textbox(
                        label="Transcription",
                        placeholder="Fine-tuned model output will appear here...",
                        lines=8,
                        elem_id="finetuned-output",
                    )
                with gr.Column():
                    gr.Markdown(
                        '<div class="model-label" id="stock-label">📦 WHISPER LARGE V3 TURBO (Stock)</div>'
                    )
                    stock_output = gr.Textbox(
                        label="Transcription",
                        placeholder="Stock model output will appear here...",
                        lines=8,
                        elem_id="stock-output",
                    )

            # Connect the button to the transcription function; its two
            # return values fill the two text boxes in order.
            transcribe_btn.click(
                fn=transcribe_both,
                inputs=[audio_input],
                outputs=[finetuned_output, stock_output],
            )

        with gr.Tab("About"):
            gr.Markdown(
                """
                ## About This Demo
                This Space provides a **side-by-side comparison** of automatic speech recognition between:
                - **Whisper-Hebrish**: A fine-tuned version optimized for mixed Hebrew-English speech patterns
                - **Whisper Large V3 Turbo**: OpenAI's stock model for general-purpose transcription
                """
            )

            # Screenshots
            with gr.Row():
                with gr.Column():
                    gr.Image("screenshots/1.png", label="Demo Interface", show_label=True)
                with gr.Column():
                    gr.Image("screenshots/2.png", label="Comparison Results", show_label=True)

            gr.Markdown(
                """
                ## How to Use
                1. **Record** audio using your microphone or **upload** an audio file
                2. Click **"Transcribe with Both Models"**
                3. Compare the results side-by-side
                ## About Whisper-Hebrish
                Whisper-Hebrish is a fine-tuned version of OpenAI's Whisper model, specifically optimized for handling mixed language speech patterns common in Hebrew-English conversations and switching between languages.
                ### Model Information
                🤗 **Model:** [danielrosehill/Whisper-Hebrish](https://huggingface.co/danielrosehill/Whisper-Hebrish)
                The fine-tuned model has been trained to better handle:
                - Mixed Hebrew-English speech
                - Switching between Hebrew and English
                - Hebrew-specific phonetic patterns
                - Common transliteration patterns
                ## Technical Details
                - Inference powered by Zero GPU
                - Both models run simultaneously for direct comparison
                - Supports both microphone recording and file upload
                - Based on Whisper Large V3 architecture
                ## Use Cases
                This comparison tool is useful for:
                - Evaluating model performance on bilingual content
                - Testing speech recognition accuracy for mixed-language scenarios
                - Understanding the benefits of fine-tuning for specific language patterns
                - Benchmarking ASR quality for mixed Hebrew-English speech
                ---
                *Compare accuracy, handling of language mixing, and transcription quality between the fine-tuned and stock models.*
                """
            )

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|