File size: 7,401 Bytes
edec2f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8701643
 
edec2f0
6349797
 
 
 
 
 
edec2f0
6349797
 
 
edec2f0
6349797
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
edec2f0
6349797
 
 
 
edec2f0
6349797
edec2f0
6349797
 
 
edec2f0
 
6349797
 
 
 
 
 
 
edec2f0
6349797
 
edec2f0
6349797
 
 
edec2f0
6349797
edec2f0
8701643
edec2f0
6349797
 
 
 
 
 
8701643
6349797
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8701643
6349797
 
 
8701643
6349797
 
edec2f0
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import gradio as gr
import spaces
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch

# Identifiers of the two checkpoints that are compared side by side
FINETUNED_MODEL_ID = "danielrosehill/Whisper-Hebrish"
STOCK_MODEL_ID = "openai/whisper-large-v3-turbo"

# Runtime placement: first CUDA device with half precision when CUDA is
# available, otherwise CPU with full precision
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

def _load_asr_pipeline(model_id):
    """Build a speech-recognition pipeline for one checkpoint.

    Both models are loaded with exactly the same settings, so the shared
    steps live here instead of being written out twice.

    Args:
        model_id: Identifier passed to ``from_pretrained`` for both the model
            and its processor.

    Returns:
        A ``(model, processor, pipe)`` tuple: the loaded model (already moved
        to ``device``), its processor, and the
        ``"automatic-speech-recognition"`` pipeline assembled from them.
    """
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
    )
    return model, processor, pipe


# Load fine-tuned model
print("Loading fine-tuned model...")
finetuned_model, finetuned_processor, finetuned_pipe = _load_asr_pipeline(
    FINETUNED_MODEL_ID
)

# Load stock model
print("Loading stock model...")
stock_model, stock_processor, stock_pipe = _load_asr_pipeline(STOCK_MODEL_ID)

@spaces.GPU
def transcribe_both(audio):
    """Run the same audio through the fine-tuned and the stock pipeline.

    Args:
        audio: Value supplied by the audio input component, or ``None`` when
            nothing was recorded or uploaded.

    Returns:
        A ``(finetuned_text, stock_text)`` pair holding the ``"text"`` field
        of each pipeline's result. When ``audio`` is ``None`` both entries
        are the same prompt asking the user to provide audio.
    """
    if audio is None:
        prompt = "Please record or upload audio."
        return prompt, prompt

    # Fine-tuned pipeline runs first, the stock pipeline second
    finetuned_text, stock_text = (
        asr(audio)["text"] for asr in (finetuned_pipe, stock_pipe)
    )
    return finetuned_text, stock_text

# Custom CSS for styling.
# Passed to gr.Blocks(css=...) when the interface is built. The
# #finetuned-output / #stock-output selectors match the elem_id values of the
# two transcription textboxes; .model-label, #finetuned-label and #stock-label
# match the class and ids of the HTML heading <div>s rendered above them.
# Green (#10b981) is used for the fine-tuned model, blue (#3b82f6) for the
# stock model.
custom_css = """
#finetuned-output {
    border: 3px solid #10b981 !important;
    border-radius: 8px !important;
    padding: 16px !important;
}

#stock-output {
    border: 3px solid #3b82f6 !important;
    border-radius: 8px !important;
    padding: 16px !important;
}

.model-label {
    font-size: 24px !important;
    font-weight: bold !important;
    margin-bottom: 12px !important;
}

#finetuned-label {
    color: #10b981 !important;
}

#stock-label {
    color: #3b82f6 !important;
}

.gradio-container {
    max-width: 1200px !important;
}
"""

# Create Gradio interface: a two-tab Blocks app. "Compare Models" holds the
# audio input, the transcribe button and the two result textboxes; "About"
# holds static documentation and screenshots.
with gr.Blocks(css=custom_css, title="Whisper Hebrish: Side By Side Demo") as demo:
    gr.Markdown("# Whisper Hebrish: Side By Side Demo")

    with gr.Tabs():
        with gr.Tab("Compare Models"):
            gr.Markdown(
                """
                Compare the performance of **Whisper-Hebrish** (fine-tuned for mixed language speech)
                with the stock **Whisper Large V3 Turbo** model.

                **Record or upload audio** to see side-by-side transcription results.
                """
            )

            with gr.Row():
                # type="filepath": the callback receives a path to the audio file
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Record or Upload Audio"
                )

            transcribe_btn = gr.Button("🚀 Transcribe with Both Models", variant="primary", size="lg")

            gr.Markdown("---")

            # Two equal columns, one per model; elem_id / id values are the
            # hooks targeted by custom_css
            with gr.Row():
                with gr.Column():
                    gr.Markdown(
                        '<div class="model-label" id="finetuned-label">✨ WHISPER-HEBRISH (Fine-Tuned)</div>'
                    )
                    finetuned_output = gr.Textbox(
                        label="Transcription",
                        placeholder="Fine-tuned model output will appear here...",
                        lines=8,
                        elem_id="finetuned-output"
                    )

                with gr.Column():
                    gr.Markdown(
                        '<div class="model-label" id="stock-label">📦 WHISPER LARGE V3 TURBO (Stock)</div>'
                    )
                    stock_output = gr.Textbox(
                        label="Transcription",
                        placeholder="Stock model output will appear here...",
                        lines=8,
                        elem_id="stock-output"
                    )

            # Connect the button to the transcription function; the returned
            # pair fills the fine-tuned textbox first, then the stock textbox
            transcribe_btn.click(
                fn=transcribe_both,
                inputs=[audio_input],
                outputs=[finetuned_output, stock_output]
            )

        with gr.Tab("About"):
            gr.Markdown(
                """
                ## About This Demo

                This Space provides a **side-by-side comparison** of automatic speech recognition between:

                - **Whisper-Hebrish**: A fine-tuned version optimized for mixed Hebrew-English speech patterns
                - **Whisper Large V3 Turbo**: OpenAI's stock model for general-purpose transcription
                """
            )

            # Screenshots (paths are relative to the working directory)
            with gr.Row():
                with gr.Column():
                    gr.Image("screenshots/1.png", label="Demo Interface", show_label=True)
                with gr.Column():
                    gr.Image("screenshots/2.png", label="Comparison Results", show_label=True)

            gr.Markdown(
                """
                ## How to Use

                1. **Record** audio using your microphone or **upload** an audio file
                2. Click **"Transcribe with Both Models"**
                3. Compare the results side-by-side

                ## About Whisper-Hebrish

                Whisper-Hebrish is a fine-tuned version of OpenAI's Whisper model, specifically optimized for handling mixed language speech patterns common in Hebrew-English conversations and switching between languages.

                ### Model Information

                🤗 **Model:** [danielrosehill/Whisper-Hebrish](https://huggingface.co/danielrosehill/Whisper-Hebrish)

                The fine-tuned model has been trained to better handle:
                - Mixed Hebrew-English speech
                - Switching between Hebrew and English
                - Hebrew-specific phonetic patterns
                - Common transliteration patterns

                ## Technical Details

                - Inference powered by Zero GPU
                - Both models run simultaneously for direct comparison
                - Supports both microphone recording and file upload
                - Based on Whisper Large V3 architecture

                ## Use Cases

                This comparison tool is useful for:
                - Evaluating model performance on bilingual content
                - Testing speech recognition accuracy for mixed-language scenarios
                - Understanding the benefits of fine-tuning for specific language patterns
                - Benchmarking ASR quality for mixed Hebrew-English speech

                ---

                *Compare accuracy, handling of language mixing, and transcription quality between the fine-tuned and stock models.*
                """
            )

# Launch the Gradio app only when this file is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    demo.launch()