Spaces:
Runtime error
Runtime error
| import sys | |
| import time | |
| try: | |
| import spaces | |
| except ImportError: | |
| print("ZeroGPU is not available, skipping...") | |
| import torch | |
| import torchaudio | |
| import gradio as gr | |
| import torchaudio.transforms as T | |
| import polars as pl | |
| from importlib.metadata import version | |
| from gradio.utils import is_zero_gpu_space | |
| from gradio.themes import Base | |
| from transformers import ( | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| AutoProcessor, | |
| MoonshineForConditionalGeneration, | |
| ) | |
| from doctr.io import DocumentFile | |
| from doctr.models import ocr_predictor | |
# Runtime detection: are we inside a Hugging Face ZeroGPU space, and is CUDA usable?
use_zero_gpu = is_zero_gpu_space()
use_cuda = torch.cuda.is_available()

# The `spaces` package version is only looked up when running on ZeroGPU.
spaces_version = version("spaces") if use_zero_gpu else "N/A"
print(
    "ZeroGPU is available, changing inference call."
    if use_zero_gpu
    else "ZeroGPU is not available, skipping..."
)
print(f"Spaces version: {spaces_version}")

if use_cuda:
    print("CUDA is available, setting correct `device` variable.")
device = "cuda" if use_cuda else "cpu"
# bfloat16 is used on both CUDA and CPU.
torch_dtype = torch.bfloat16
# Config
model_name = "Yehor/kulyk-en-uk"
concurrency_limit = 5
current_theme = Base()

# Translation model and its tokenizer, placed on the detected device in inference mode.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch_dtype,
    device_map=device,
    trust_remote_code=True,
)
model.eval()

# Speech recognition (Moonshine): processor and model come from the same checkpoint.
_asr_checkpoint = "UsefulSensors/moonshine-base"
audio_processor = AutoProcessor.from_pretrained(_asr_checkpoint)
audio_model = MoonshineForConditionalGeneration.from_pretrained(
    _asr_checkpoint,
    attn_implementation="sdpa",
)
audio_model.to(device=device, dtype=torch_dtype)

# OCR (docTR) with pretrained weights, moved to the same device.
ocr_model = ocr_predictor(pretrained=True)
ocr_model.to(device)
# Examples offered in the UI: English headlines for the text tab ...
examples_text = [
    "WP: F-16s are unlikely to make a significant difference on the battlefield",
    "Missile and 7 of 8 Shaheeds shot down over Ukraine",
    "Olympic Games 2024. Schedule of competitions for Ukrainian athletes on 28 July",
    "Harris' campaign raised more than $200 million in less than a week",
    "Over the week, the NBU sold almost $800 million on the interbank market",
    "Paris 2024. Day 2: Text broadcast",
]

# ... and numbered sample files for the audio (1-7) and image (1-6) tabs.
examples_audio = [f"example_{index}.wav" for index in range(1, 8)]
examples_image = [f"example_{index}.jpg" for index in range(1, 7)]
title = "EN-UK Translator"

# Rows of the single-column Markdown table rendered on the "Authors" tab.
# NOTE(review): "[email protected]" looks like an e-mail obfuscation placeholder
# rather than a real address - confirm and restore the intended contact.
_authors_rows = (
    '| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |',
    "|-------------------------------------------------------------------------------------------------|",
    "| https://t.me/smlkw in Telegram |",
    "| https://x.com/yehor_smoliakov at X |",
    "| https://github.com/egorsmkv at GitHub |",
    "| https://huggingface.co/Yehor at Hugging Face |",
    "| or use [email protected] |",
)
authors_table = "\n".join(
    (
        "## Authors",
        "Follow them on social networks and **contact** if you need any help or have any questions:",
        *_authors_rows,
    )
)

# Heading and one-paragraph introduction shown at the top of every input tab.
description_head = (
    f"# {title}\n"
    "This space translates your text, audio, image from English to Ukrainian using "
    "[kulyk-en-uk](https://huggingface.co/Yehor/kulyk-en-uk) model. "
    "Also, check [UK-EN Translator](https://huggingface.co/spaces/Yehor/uk-en-translator) out."
)
# Markdown for the "Environment, Models, and Libraries" tab: interpreter and model links ...
tech_env = "\n".join(
    (
        "#### Environment",
        f"- Python: {sys.version}",
        "#### Models",
        "- [kulyk-en-uk](https://huggingface.co/Yehor/kulyk-en-uk)",
        "- [moonshine-base](https://huggingface.co/UsefulSensors/moonshine-base)",
        "- [doctr](https://github.com/mindee/doctr)",
    )
).strip()

# ... followed by the installed versions of the main libraries.
tech_libraries = "\n".join(
    (
        "#### Libraries",
        *(f"- {package}: {version(package)}" for package in ("torch", "gradio", "transformers")),
    )
).strip()
def translate(text: str) -> str:
    """Translate one piece of English text to Ukrainian with the chat model.

    The text is wrapped in a single user message, rendered through the
    tokenizer's chat template and decoded greedily (no sampling) with a
    light repetition penalty.

    Args:
        text: English text to translate.

    Returns:
        The generated translation with surrounding whitespace removed.
    """
    messages = [
        {"role": "user", "content": "Translate the text to Ukrainian:\n" + text},
    ]
    prompt_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    prompt_ids = prompt_ids.to(model.device)

    generated = model.generate(
        prompt_ids,
        max_new_tokens=2048,
        do_sample=False,  # greedy search
        repetition_penalty=1.05,
    )

    # The output starts with the prompt; keep only what was generated after it.
    completion_ids = generated[:, prompt_ids.shape[1] :]
    decoded = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)
    return decoded[0].strip()
def inference_text(text, progress=gr.Progress()):
    """Translate pasted text line by line and return the results as a table.

    Args:
        text: Raw contents of the text box. Each non-blank line is translated
            independently.
        progress: Gradio progress tracker; the `gr.Progress()` default is how
            Gradio injects the tracker into event handlers.

    Returns:
        A polars DataFrame with one row per translated line in a single
        ``text`` column.

    Raises:
        gr.Error: If the input is empty or contains only whitespace.
    """
    # Trim every line and drop the blank ones.
    trimmed = (line.strip() for line in (text or "").split("\n"))
    sentences = [line for line in trimmed if line]

    # Whitespace-only input is treated like empty input instead of silently
    # producing an empty table.
    if not sentences:
        raise gr.Error("Please paste your text.")

    progress(0, desc="Translating...")

    # `translate` already strips its result, so no further clean-up is needed.
    results = [
        {"text": translate(sentence)}
        for sentence in progress.tqdm(
            sentences, desc="Translating...", unit="sentence"
        )
    ]

    gr.Info("Finished!", duration=2)
    return pl.DataFrame(results)
def inference_audio(audio, progress=gr.Progress()):
    """Transcribe an English audio file and translate the transcript.

    Args:
        audio: Path to an audio file (what the UI's ``type="filepath"`` audio
            component delivers), or a ``(waveform, sample_rate)`` pair.
        progress: Gradio progress tracker; the `gr.Progress()` default is how
            Gradio injects the tracker into event handlers.

    Returns:
        A polars DataFrame with one row per translated transcript line in a
        single ``text`` column.

    Raises:
        gr.Error: If no audio was provided.
    """
    if not audio:
        raise gr.Error("Please paste your audio file.")

    progress(0, desc="Translating...")

    if isinstance(audio, str):
        audio_array, sr = torchaudio.load(audio)
        # torchaudio returns (channels, frames). Averaging over the channel
        # axis yields a 1-D mono signal for both mono and multi-channel
        # files; a plain squeeze() would leave stereo input two-dimensional.
        if audio_array.dim() > 1:
            audio_array = audio_array.mean(dim=0)
    else:
        # NOTE(review): Gradio's type="numpy" format is (sample_rate, data),
        # the reverse of what is unpacked here. This branch is not reached
        # while the component uses type="filepath" - confirm before relying on it.
        audio_array, sr = audio

    r_sr = audio_processor.feature_extractor.sampling_rate

    print("Audio processor SR:", r_sr)
    print("Audio file SR:", sr)

    if r_sr != sr:
        print("Resampling...")
        resampler = T.Resample(orig_freq=sr, new_freq=r_sr)
        audio_array = resampler(audio_array)

    inputs = audio_processor(audio_array, return_tensors="pt", sampling_rate=r_sr)
    inputs = inputs.to(device, dtype=torch_dtype)

    # To avoid hallucination loops, cap the generated length at an expected
    # maximum of 6.5 tokens per second of audio (the factor divides by the
    # sampling rate, so the attention mask is assumed to count samples).
    token_limit_factor = 6.5 / r_sr
    seq_lens = inputs.attention_mask.sum(dim=-1)
    max_length = int((seq_lens * token_limit_factor).max().item())

    generated_ids = audio_model.generate(**inputs, max_length=max_length)
    predictions = audio_processor.batch_decode(generated_ids, skip_special_tokens=True)

    print("Predictions:", predictions)

    text = predictions[0]

    print("Text:", text)

    # Translate each non-blank transcript line on its own.
    trimmed = (line.strip() for line in text.split("\n"))
    sentences = [line for line in trimmed if line]

    results = [
        {"text": translate(sentence)}
        for sentence in progress.tqdm(
            sentences, desc="Translating...", unit="sentence"
        )
    ]

    gr.Info("Finished!", duration=2)
    return pl.DataFrame(results)
def inference_image(image, progress=gr.Progress()):
    """Read the text in an image with OCR and translate it.

    Args:
        image: Path to an image file (what the UI's ``type="filepath"`` image
            component delivers).
        progress: Gradio progress tracker; the `gr.Progress()` default is how
            Gradio injects the tracker into event handlers.

    Returns:
        A polars DataFrame with a single row holding the translation in a
        ``text`` column.

    Raises:
        gr.Error: If no image path was provided, or if OCR found no text.
    """
    # Check the type first so that non-path values are rejected with a
    # user-facing error rather than being evaluated for truthiness.
    if not isinstance(image, str) or not image:
        raise gr.Error("Please paste your image file.")

    progress(0, desc="Translating...")

    doc = DocumentFile.from_images(image)
    result = ocr_model(doc)
    text = result.render()

    print("Text:", text)

    # The whole page is translated as one passage, with line breaks flattened.
    passage = text.replace("\n", " ")

    # Without this guard an image with no readable text would send an empty
    # prompt to the translation model.
    if not passage.strip():
        raise gr.Error("No text was recognized in the image.")

    results = [
        {"text": translate(sentence)}
        for sentence in progress.tqdm(
            [passage], desc="Translating...", unit="sentence"
        )
    ]

    gr.Info("Finished!", duration=2)
    return pl.DataFrame(results)
def create_app():
    """Build the text tab: result table, text box, translate button, examples.

    Returns:
        The `gr.Blocks` container holding the tab.
    """
    with gr.Blocks(
        theme=current_theme,
        title=title,
        analytics_enabled=False,
    ) as tab:
        gr.Markdown(description_head)
        gr.Markdown("## Usage")

        # Components are created in display order: output table above the input box.
        translated_text = gr.DataFrame(label="Translated text")
        text = gr.Textbox(label="Text", autofocus=True, lines=5)

        translate_button = gr.Button("Translate")
        translate_button.click(
            inference_text,
            inputs=text,
            outputs=translated_text,
            concurrency_limit=concurrency_limit,
        )

        with gr.Row():
            gr.Examples(examples=examples_text, inputs=text, label="Choose an example")

    return tab
def create_audio_app():
    """Build the audio tab: result table, file upload, translate button, examples.

    Returns:
        The `gr.Blocks` container holding the tab.
    """
    tab = gr.Blocks(theme=current_theme)
    with tab:
        gr.Markdown(description_head)
        gr.Markdown("## Usage")

        translated_text = gr.DataFrame(label="Translated text")
        # type="filepath" makes the handler receive a path to the uploaded file.
        audio = gr.Audio(label="Audio file", sources="upload", type="filepath")

        translate_button = gr.Button("Translate")
        translate_button.click(
            inference_audio,
            inputs=audio,
            outputs=translated_text,
            concurrency_limit=concurrency_limit,
        )

        with gr.Row():
            gr.Examples(examples=examples_audio, inputs=audio, label="Choose an example")

    return tab
def create_image_app():
    """Build the image tab: result table, file upload, translate button, examples.

    Returns:
        The `gr.Blocks` container holding the tab.
    """
    tab = gr.Blocks(theme=current_theme)
    with tab:
        gr.Markdown(description_head)
        gr.Markdown("## Usage")

        translated_text = gr.DataFrame(label="Translated text")
        # type="filepath" makes the handler receive a path to the uploaded file.
        image = gr.Image(label="Image file", sources="upload", type="filepath")

        translate_button = gr.Button("Translate")
        translate_button.click(
            inference_image,
            inputs=image,
            outputs=translated_text,
            concurrency_limit=concurrency_limit,
        )

        with gr.Row():
            gr.Examples(examples=examples_image, inputs=image, label="Choose an example")

    return tab
def create_env():
    """Build the tab listing the environment, models and library versions.

    Returns:
        The `gr.Blocks` container holding the tab.
    """
    tab = gr.Blocks(theme=current_theme)
    with tab:
        for section in (tech_env, tech_libraries):
            gr.Markdown(section)
    return tab
def create_authors():
    """Build the tab that shows the authors' contact table.

    Returns:
        The `gr.Blocks` container holding the tab.
    """
    tab = gr.Blocks(theme=current_theme)
    with tab:
        gr.Markdown(authors_table)
    return tab
def create_demo():
    """Assemble all tabs into the tabbed interface served by the app.

    Returns:
        A `gr.TabbedInterface` with the text, audio, image, authors and
        environment tabs, in that order.
    """
    # Each label is paired with its tab so the two lists cannot drift apart.
    # The emoji prefixes replace mis-encoded (mojibake) characters.
    # NOTE(review): the audio and image emoji are best-effort choices - confirm
    # against the intended labels.
    tabs = [
        ("✍️ Text", create_app()),
        ("🔊 Audio", create_audio_app()),
        ("🌄 Image", create_image_app()),
        ("👥 Authors", create_authors()),
        ("📦 Environment, Models, and Libraries", create_env()),
    ]

    return gr.TabbedInterface(
        [tab for _, tab in tabs],
        tab_names=[name for name, _ in tabs],
    )
if __name__ == "__main__":
    # Build the UI, enable the request queue, then start the server.
    demo = create_demo()
    demo.queue().launch()