from typing import Tuple, Optional import tempfile import numpy as numpy_module import soundfile as soundfile_module import torch import gradio as gradio_module from PIL import Image import easyocr from transformers import ( pipeline, VitsModel, AutoTokenizer, ) device_string: str = "cpu" ocr_reader = easyocr.Reader( ["en"], gpu=False, ) def run_ocr(image_object: Image.Image) -> str: """ OCR для печатного английского текста. """ if image_object is None: return "" rgb_image_object: Image.Image = image_object.convert("RGB") numpy_image = numpy_module.array(rgb_image_object) ocr_results = ocr_reader.readtext( numpy_image, detail=0, paragraph=True, ) text_parts = [str(text_value) for text_value in ocr_results if text_value] recognized_text: str = "\n".join(text_parts).strip() return recognized_text text_classifier_pipeline = pipeline( task="text-classification", model="distilbert-base-uncased-finetuned-sst-2-english", ) def run_text_classification(input_text: str) -> str: """ Анализ текста трансформером. """ cleaned_text: str = input_text.strip() if not cleaned_text: return "" classifier_result_list = text_classifier_pipeline( cleaned_text, truncation=True, max_length=512, ) classifier_result = classifier_result_list[0] label_value: str = str(classifier_result.get("label", "")) score_value: float = float(classifier_result.get("score", 0.0)) classification_text: str = f"{label_value} (score={score_value:.3f})" return classification_text summary_pipeline = pipeline( task="summarization", model="sshleifer/distilbart-cnn-12-6", ) def run_summarization( input_text: str, max_summary_tokens: int = 128, ) -> str: """ Английская суммаризация. """ cleaned_text: str = input_text.strip() if not cleaned_text: return "" word_count: int = len(cleaned_text.split()) dynamic_max_length: int = min( max_summary_tokens, max(32, word_count + 20), ) if word_count < 8: return cleaned_text summary_result_list = summary_pipeline( cleaned_text, max_length=dynamic_max_length, min_length=max(10, dynamic_max_length // 3), do_sample=False, ) summary_text: str = summary_result_list[0]["summary_text"].strip() return summary_text tts_model: VitsModel = VitsModel.from_pretrained("facebook/mms-tts-eng") tts_tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng") tts_model.to(device_string) def run_tts(summary_text: str) -> Optional[str]: """ Озвучка английского текста конспекта через VitsModel (facebook/mms-tts-eng). """ cleaned_text: str = summary_text.strip() if not cleaned_text: return None tokenized_inputs = tts_tokenizer( cleaned_text, return_tensors="pt", ) tokenized_inputs = { key: value.to(device_string) for key, value in tokenized_inputs.items() } input_ids_tensor = tokenized_inputs.get("input_ids") if input_ids_tensor is None or input_ids_tensor.numel() == 0: return None try: with torch.no_grad(): model_output = tts_model(**tokenized_inputs) waveform_tensor = model_output.waveform # (batch, n_samples) except RuntimeError as runtime_error: print(f"[WARN] TTS RuntimeError: {runtime_error}") return None waveform_array = waveform_tensor.squeeze().cpu().numpy().astype("float32") waveform_array = numpy_module.clip(waveform_array, -1.0, 1.0) with tempfile.NamedTemporaryFile( suffix=".wav", delete=False, ) as temporary_file: soundfile_module.write( temporary_file.name, waveform_array, tts_model.config.sampling_rate, ) file_path: str = temporary_file.name return file_path def full_flow( image_object: Image.Image, max_summary_tokens: int = 128, ) -> Tuple[str, str, str, Optional[str]]: """ 1) OCR 2) Классификация текста 3) Суммаризация 4) TTS """ recognized_text: str = run_ocr(image_object=image_object) classification_text: str = run_text_classification(recognized_text) summary_text: str = run_summarization( input_text=recognized_text, max_summary_tokens=max_summary_tokens, ) audio_file_path: Optional[str] = run_tts(summary_text=summary_text) return recognized_text, classification_text, summary_text, audio_file_path gradio_interface = gradio_module.Interface( fn=full_flow, inputs=[ gradio_module.Image( type="pil", label="Изображение с напечатанным английским текстом", ), gradio_module.Slider( minimum=32, maximum=256, value=128, step=16, label="Максимальная длина конспекта (токены, примерно)", ), ], outputs=[ gradio_module.Textbox( label="Распознанный текст (OCR, easyocr)", lines=8, ), gradio_module.Textbox( label="Анализ текста (классификация, DistilBERT)", lines=2, ), gradio_module.Textbox( label="Конспект (английский текст, DistilBART)", lines=6, ), gradio_module.Audio( label="Озвучка конспекта (английский TTS, VITS)", type="filepath", ), ], title="Картинка → Текст → Анализ → Конспект → Озвучка", description=( "1) easyocr распознаёт печатный английский текст с картинки.\n" "2) Трансформер-классификатор (DistilBERT) оценивает тон текста.\n" "3) Трансформер-суммаризатор (DistilBART) делает краткий конспект.\n" "4) Трансформер TTS (MMS VITS) озвучивает конспект." ), ) if __name__ == "__main__": gradio_interface.launch()