Spaces:

danielrosehill
/

Whisper-Hebrish

Sleeping

App Files Files Community

Whisper-Hebrish / app.py

danielrosehill

commit

8701643 28 days ago

raw

history blame contribute delete

7.4 kB

	import gradio as gr
	import spaces
	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
	import torch

	# Hugging Face Hub repository IDs of the two checkpoints being compared.
	FINETUNED_MODEL_ID = "danielrosehill/Whisper-Hebrish"
	STOCK_MODEL_ID = "openai/whisper-large-v3-turbo"

	# Use the first CUDA device with float16 when CUDA is available; otherwise
	# fall back to the CPU with float32.
	if torch.cuda.is_available():
	    device, torch_dtype = "cuda:0", torch.float16
	else:
	    device, torch_dtype = "cpu", torch.float32

	def _load_asr_pipeline(model_id):
	    """Load one Whisper checkpoint and wrap it in a speech-recognition pipeline.

	    Both checkpoints go through this single code path so they are always
	    loaded with the same options and the comparison stays like-for-like.

	    Args:
	        model_id: Hugging Face Hub repository ID of the checkpoint to load.

	    Returns:
	        A ``(model, processor, pipe)`` tuple: the loaded model (after being
	        moved to ``device``), its processor, and the
	        "automatic-speech-recognition" pipeline built from the two.
	    """
	    model = AutoModelForSpeechSeq2Seq.from_pretrained(
	        model_id,
	        torch_dtype=torch_dtype,
	        low_cpu_mem_usage=True,
	        use_safetensors=True,
	    )
	    model.to(device)
	    processor = AutoProcessor.from_pretrained(model_id)
	    pipe = pipeline(
	        "automatic-speech-recognition",
	        model=model,
	        tokenizer=processor.tokenizer,
	        feature_extractor=processor.feature_extractor,
	        torch_dtype=torch_dtype,
	        device=device,
	    )
	    return model, processor, pipe


	# Load fine-tuned model
	print("Loading fine-tuned model...")
	finetuned_model, finetuned_processor, finetuned_pipe = _load_asr_pipeline(
	    FINETUNED_MODEL_ID
	)

	# Load stock model
	print("Loading stock model...")
	stock_model, stock_processor, stock_pipe = _load_asr_pipeline(STOCK_MODEL_ID)

	@spaces.GPU
	def transcribe_both(audio):
	    """Transcribe one audio clip with both the fine-tuned and the stock model.

	    Args:
	        audio: The audio input handed over by the Gradio audio component
	            (configured with ``type="filepath"``), or ``None`` when nothing
	            was recorded or uploaded.

	    Returns:
	        A ``(finetuned_text, stock_text)`` tuple of transcriptions. When
	        ``audio`` is ``None`` both entries hold a prompt asking for audio.
	    """
	    if audio is None:
	        prompt = "Please record or upload audio."
	        return prompt, prompt

	    # Whisper works on 30-second windows. Without chunking, a longer clip
	    # switches generation to long-form mode, which recent transformers
	    # releases reject unless timestamps are requested. Splitting the input
	    # into 30-second chunks lets clips of any length be transcribed; a clip
	    # shorter than one chunk is still processed as a single piece.
	    chunk_length_s = 30

	    # Transcribe with fine-tuned model
	    finetuned_text = finetuned_pipe(audio, chunk_length_s=chunk_length_s)["text"]

	    # Transcribe with stock model
	    stock_text = stock_pipe(audio, chunk_length_s=chunk_length_s)["text"]

	    return finetuned_text, stock_text

	# Custom CSS for styling. The stylesheet:
	#   - draws a green border around the element with id "finetuned-output" and a
	#     blue border around the element with id "stock-output";
	#   - enlarges and bolds elements with class "model-label", colouring the
	#     "finetuned-label" / "stock-label" ids to match their boxes;
	#   - caps the width of ".gradio-container" at 1200px.
	# Every rule uses !important.
	custom_css = """
	#finetuned-output {
	border: 3px solid #10b981 !important;
	border-radius: 8px !important;
	padding: 16px !important;
	}

	#stock-output {
	border: 3px solid #3b82f6 !important;
	border-radius: 8px !important;
	padding: 16px !important;
	}

	.model-label {
	font-size: 24px !important;
	font-weight: bold !important;
	margin-bottom: 12px !important;
	}

	#finetuned-label {
	color: #10b981 !important;
	}

	#stock-label {
	color: #3b82f6 !important;
	}

	.gradio-container {
	max-width: 1200px !important;
	}
	"""

	# Create Gradio interface
	# NOTE(review): the nesting of the layout containers below is reconstructed;
	# confirm that the transcribe button belongs below the audio row rather than
	# inside it.
	_PAGE_TITLE = "Whisper Hebrish: Side By Side Demo"

	# Introductory text shown at the top of the "Compare Models" tab.
	_COMPARE_INTRO_MD = """
	Compare the performance of Whisper-Hebrish (fine-tuned for mixed language speech)
	with the stock Whisper Large V3 Turbo model.

	Record or upload audio to see side-by-side transcription results.
	"""

	# Headings rendered above the two transcription boxes; their class and ids
	# are the hooks targeted by the stylesheet.
	_FINETUNED_HEADING_HTML = '<div class="model-label" id="finetuned-label">✨ WHISPER-HEBRISH (Fine-Tuned)</div>'
	_STOCK_HEADING_HTML = '<div class="model-label" id="stock-label">📦 WHISPER LARGE V3 TURBO (Stock)</div>'

	# Opening section of the "About" tab (rendered above the screenshots).
	_ABOUT_INTRO_MD = """
	## About This Demo

	This Space provides a side-by-side comparison of automatic speech recognition between:

	- Whisper-Hebrish: A fine-tuned version optimized for mixed Hebrew-English speech patterns
	- Whisper Large V3 Turbo: OpenAI's stock model for general-purpose transcription
	"""

	# Remainder of the "About" tab (rendered below the screenshots).
	_ABOUT_DETAILS_MD = """
	## How to Use

	1. Record audio using your microphone or upload an audio file
	2. Click "Transcribe with Both Models"
	3. Compare the results side-by-side

	## About Whisper-Hebrish

	Whisper-Hebrish is a fine-tuned version of OpenAI's Whisper model, specifically optimized for handling mixed language speech patterns common in Hebrew-English conversations and switching between languages.

	### Model Information

	🤗 Model: [danielrosehill/Whisper-Hebrish](https://huggingface.co/danielrosehill/Whisper-Hebrish)

	The fine-tuned model has been trained to better handle:
	- Mixed Hebrew-English speech
	- Switching between Hebrew and English
	- Hebrew-specific phonetic patterns
	- Common transliteration patterns

	## Technical Details

	- Inference powered by Zero GPU
	- Both models run simultaneously for direct comparison
	- Supports both microphone recording and file upload
	- Based on Whisper Large V3 architecture

	## Use Cases

	This comparison tool is useful for:
	- Evaluating model performance on bilingual content
	- Testing speech recognition accuracy for mixed-language scenarios
	- Understanding the benefits of fine-tuning for specific language patterns
	- Benchmarking ASR quality for mixed Hebrew-English speech

	---

	Compare accuracy, handling of language mixing, and transcription quality between the fine-tuned and stock models.
	"""


	def _result_column(heading_html, placeholder, elem_id):
	    """Render one results column and return its transcription textbox.

	    The column holds a heading (``heading_html``) followed by an eight-line
	    textbox labelled "Transcription" with the given placeholder and element id.
	    """
	    with gr.Column():
	        gr.Markdown(heading_html)
	        box = gr.Textbox(
	            label="Transcription",
	            placeholder=placeholder,
	            lines=8,
	            elem_id=elem_id,
	        )
	    return box


	with gr.Blocks(css=custom_css, title=_PAGE_TITLE) as demo:
	    gr.Markdown(f"# {_PAGE_TITLE}")

	    with gr.Tabs():
	        with gr.Tab("Compare Models"):
	            gr.Markdown(_COMPARE_INTRO_MD)

	            # Audio is passed to the callback as a file path.
	            with gr.Row():
	                audio_input = gr.Audio(
	                    sources=["microphone", "upload"],
	                    type="filepath",
	                    label="Record or Upload Audio",
	                )

	            transcribe_btn = gr.Button(
	                "🚀 Transcribe with Both Models",
	                variant="primary",
	                size="lg",
	            )

	            gr.Markdown("---")

	            # Fine-tuned result on the left, stock result on the right.
	            with gr.Row():
	                finetuned_output = _result_column(
	                    _FINETUNED_HEADING_HTML,
	                    "Fine-tuned model output will appear here...",
	                    "finetuned-output",
	                )
	                stock_output = _result_column(
	                    _STOCK_HEADING_HTML,
	                    "Stock model output will appear here...",
	                    "stock-output",
	                )

	            # One click fills both textboxes from a single callback run.
	            transcribe_btn.click(
	                fn=transcribe_both,
	                inputs=[audio_input],
	                outputs=[finetuned_output, stock_output],
	            )

	        with gr.Tab("About"):
	            gr.Markdown(_ABOUT_INTRO_MD)

	            # Screenshots
	            with gr.Row():
	                with gr.Column():
	                    gr.Image("screenshots/1.png", label="Demo Interface", show_label=True)
	                with gr.Column():
	                    gr.Image("screenshots/2.png", label="Comparison Results", show_label=True)

	            gr.Markdown(_ABOUT_DETAILS_MD)

	# Start the Gradio app only when this file is executed as a script, so that
	# importing the module does not launch it.
	if __name__ == "__main__":
	    demo.launch()