# Hugging Face Space: danielrosehill/Whisper-Hebrish
# (Space status when captured: Sleeping; file size: 7,401 bytes)

import gradio as gr
import spaces
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch

# Hugging Face Hub identifiers of the two checkpoints being compared
FINETUNED_MODEL_ID = "danielrosehill/Whisper-Hebrish"
STOCK_MODEL_ID = "openai/whisper-large-v3-turbo"

# Use the first CUDA device with half precision when CUDA is available;
# otherwise fall back to the CPU with full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

def _load_asr_pipeline(model_id):
    """Load a speech-to-text checkpoint and wrap it in an ASR pipeline.

    Both checkpoints are loaded with identical settings, so the shared logic
    lives here instead of being duplicated per model.

    Args:
        model_id: Hugging Face Hub repository ID of the checkpoint to load.

    Returns:
        A ``(model, processor, pipe)`` tuple: the loaded model (moved to the
        module-level ``device``), its processor, and an
        "automatic-speech-recognition" pipeline built from the two.
    """
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
    )
    return model, processor, pipe


# Load fine-tuned model
print("Loading fine-tuned model...")
finetuned_model, finetuned_processor, finetuned_pipe = _load_asr_pipeline(
    FINETUNED_MODEL_ID
)

# Load stock model
print("Loading stock model...")
stock_model, stock_processor, stock_pipe = _load_asr_pipeline(STOCK_MODEL_ID)

# Whisper models operate on 30-second windows; longer recordings are split
# into chunks of this length by the pipeline.
_CHUNK_LENGTH_S = 30


@spaces.GPU
def transcribe_both(audio):
    """Transcribe one recording with both the fine-tuned and the stock model.

    Args:
        audio: The audio to transcribe, or ``None`` when nothing was recorded
            or uploaded. As wired in this file, the Gradio audio component
            uses ``type="filepath"``, so this is a path to the audio file.

    Returns:
        A ``(finetuned_text, stock_text)`` tuple of transcriptions. When
        ``audio`` is ``None`` both entries are the same prompt asking the
        user to provide audio.
    """
    if audio is None:
        prompt = "Please record or upload audio."
        return prompt, prompt

    # chunk_length_s turns on the pipeline's chunked long-form mode. Without
    # it a plain call does not cope with recordings longer than one 30 s
    # window (depending on the transformers version the input is cut off or
    # an error is raised). Shorter recordings still fit in a single chunk.
    # The models are run one after the other on the same input.
    finetuned_text = finetuned_pipe(audio, chunk_length_s=_CHUNK_LENGTH_S)["text"]
    stock_text = stock_pipe(audio, chunk_length_s=_CHUNK_LENGTH_S)["text"]

    return finetuned_text, stock_text

# Custom CSS for styling, passed to gr.Blocks(css=...).
# The selectors match identifiers used when the UI is built:
#   - #finetuned-output / #stock-output: elem_id of the two result textboxes
#     (green border for the fine-tuned model, blue for the stock model)
#   - .model-label, #finetuned-label, #stock-label: class/id of the heading
#     <div>s rendered above each textbox, coloured to match the borders
#   - .gradio-container: caps the overall page width
custom_css = """
#finetuned-output {
    border: 3px solid #10b981 !important;
    border-radius: 8px !important;
    padding: 16px !important;
}

#stock-output {
    border: 3px solid #3b82f6 !important;
    border-radius: 8px !important;
    padding: 16px !important;
}

.model-label {
    font-size: 24px !important;
    font-weight: bold !important;
    margin-bottom: 12px !important;
}

#finetuned-label {
    color: #10b981 !important;
}

#stock-label {
    color: #3b82f6 !important;
}

.gradio-container {
    max-width: 1200px !important;
}
"""

# Create Gradio interface: a two-tab app ("Compare Models" and "About").
# Components are declared in display order inside nested layout contexts.
with gr.Blocks(css=custom_css, title="Whisper Hebrish: Side By Side Demo") as demo:
    gr.Markdown("# Whisper Hebrish: Side By Side Demo")

    with gr.Tabs():
        # --- Tab 1: record/upload audio and compare both transcriptions ---
        with gr.Tab("Compare Models"):
            gr.Markdown(
                """
                Compare the performance of **Whisper-Hebrish** (fine-tuned for mixed language speech)
                with the stock **Whisper Large V3 Turbo** model.

                **Record or upload audio** to see side-by-side transcription results.
                """
            )

            # Audio input; type="filepath" means the callback receives a file path
            with gr.Row():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Record or Upload Audio"
                )

            transcribe_btn = gr.Button("🚀 Transcribe with Both Models", variant="primary", size="lg")

            gr.Markdown("---")

            # Two result columns; elem_id / class / id values are targeted by custom_css
            with gr.Row():
                with gr.Column():
                    gr.Markdown(
                        '<div class="model-label" id="finetuned-label">✨ WHISPER-HEBRISH (Fine-Tuned)</div>'
                    )
                    finetuned_output = gr.Textbox(
                        label="Transcription",
                        placeholder="Fine-tuned model output will appear here...",
                        lines=8,
                        elem_id="finetuned-output"
                    )

                with gr.Column():
                    gr.Markdown(
                        '<div class="model-label" id="stock-label">📦 WHISPER LARGE V3 TURBO (Stock)</div>'
                    )
                    stock_output = gr.Textbox(
                        label="Transcription",
                        placeholder="Stock model output will appear here...",
                        lines=8,
                        elem_id="stock-output"
                    )

            # Connect the button to the transcription function; the returned
            # (fine-tuned, stock) pair fills the two textboxes in that order
            transcribe_btn.click(
                fn=transcribe_both,
                inputs=[audio_input],
                outputs=[finetuned_output, stock_output]
            )

        # --- Tab 2: static description, screenshots and usage notes ---
        with gr.Tab("About"):
            gr.Markdown(
                """
                ## About This Demo

                This Space provides a **side-by-side comparison** of automatic speech recognition between:

                - **Whisper-Hebrish**: A fine-tuned version optimized for mixed Hebrew-English speech patterns
                - **Whisper Large V3 Turbo**: OpenAI's stock model for general-purpose transcription
                """
            )

            # Screenshots (paths are relative to the app's working directory)
            with gr.Row():
                with gr.Column():
                    gr.Image("screenshots/1.png", label="Demo Interface", show_label=True)
                with gr.Column():
                    gr.Image("screenshots/2.png", label="Comparison Results", show_label=True)

            gr.Markdown(
                """
                ## How to Use

                1. **Record** audio using your microphone or **upload** an audio file
                2. Click **"Transcribe with Both Models"**
                3. Compare the results side-by-side

                ## About Whisper-Hebrish

                Whisper-Hebrish is a fine-tuned version of OpenAI's Whisper model, specifically optimized for handling mixed language speech patterns common in Hebrew-English conversations and switching between languages.

                ### Model Information

                🤗 **Model:** [danielrosehill/Whisper-Hebrish](https://huggingface.co/danielrosehill/Whisper-Hebrish)

                The fine-tuned model has been trained to better handle:
                - Mixed Hebrew-English speech
                - Switching between Hebrew and English
                - Hebrew-specific phonetic patterns
                - Common transliteration patterns

                ## Technical Details

                - Inference powered by Zero GPU
                - Both models transcribe the same audio, one after the other, for direct comparison
                - Supports both microphone recording and file upload
                - Based on Whisper Large V3 architecture

                ## Use Cases

                This comparison tool is useful for:
                - Evaluating model performance on bilingual content
                - Testing speech recognition accuracy for mixed-language scenarios
                - Understanding the benefits of fine-tuning for specific language patterns
                - Benchmarking ASR quality for mixed Hebrew-English speech

                ---

                *Compare accuracy, handling of language mixing, and transcription quality between the fine-tuned and stock models.*
                """
            )

# Start the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()