Opera8 committed
Commit 2010653 · verified · 1 Parent(s): 6f78923

Update app.py

Files changed (1)
  1. app.py +262 -35
app.py CHANGED
@@ -1,35 +1,262 @@
- gradio
- torch
- torchvision
- torchaudio
- spaces
- imageio-ffmpeg
- # SoniTranslate Core Dependencies
- git+https://github.com/m-bain/whisperX.git
- pyannote.audio>=3.3.2
- fairseq
- yt-dlp
- pysrt
- pydub
- faster-whisper
- audiostretchy
-
- # Translation and TTS
- google-generativeai
- openai
- edge-tts
- piper-tts==1.2.0
- TTS==0.21.1
-
- # Other utilities
- # Important: numpy must be <2 for audio libraries to work
- numpy<2
- soundfile
- librosa
- onnxruntime-gpu
- tqdm
- demucs
- python-multipart
- tenacity
- youtube-transcript-api
- ffmpeg-python
+ import os
+ import sys
+ import subprocess
+
+ # --- FFmpeg Setup (Replaces packages.txt) ---
+ try:
+     import imageio_ffmpeg
+     ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
+     ffmpeg_dir = os.path.dirname(ffmpeg_path)
+     # Add the ffmpeg binary directory to PATH so os.system("ffmpeg") works
+     os.environ["PATH"] += os.pathsep + ffmpeg_dir
+     # Ensure the bundled binary is executable
+     subprocess.run(["chmod", "+x", ffmpeg_path])
+     print(f"✅ FFmpeg configured at: {ffmpeg_path}")
+ except ImportError:
+     print("⚠️ imageio-ffmpeg not found. Please add it to requirements.txt")
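+ # Quick sanity check (illustrative sketch, optional): after the PATH update
+ # above, the binary should resolve by name, e.g.:
+ #   import shutil
+ #   assert shutil.which("ffmpeg"), "ffmpeg still not on PATH"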
+
+ # --- Main Imports ---
+ import gradio as gr
+ import torch
+ import spaces  # Required for ZeroGPU
+ from soni_translate.logging_setup import logger, set_logging_level, configure_logging_libs
+ configure_logging_libs()
+ import whisperx
+ from soni_translate.preprocessor import audio_video_preprocessor, audio_preprocessor
+ from soni_translate.postprocessor import media_out, get_no_ext_filename, sound_separate, get_subtitle_speaker
+ from soni_translate.speech_segmentation import transcribe_speech, align_speech, diarize_speech, ASR_MODEL_OPTIONS, find_whisper_models, diarization_models, COMPUTE_TYPE_CPU, COMPUTE_TYPE_GPU
+ from soni_translate.translate_segments import translate_text, TRANSLATION_PROCESS_OPTIONS
+ from soni_translate.text_to_speech import audio_segmentation_to_voice, edge_tts_voices_list, coqui_xtts_voices_list, piper_tts_voices_list
+ from soni_translate.audio_segments import create_translated_audio, accelerate_segments
+ from soni_translate.language_configuration import LANGUAGES, LANGUAGES_LIST
+ from soni_translate.utils import remove_files, get_link_list, get_valid_files, is_audio_file, is_subtitle_file
+ from soni_translate.text_multiformat_processor import process_subtitles, srt_file_to_segments, break_aling_segments
+ from soni_translate.languages_gui import language_data
+ import hashlib
+ import json
+ import copy
+ from pydub import AudioSegment
+
+ # Check for API keys from Hugging Face Secrets
+ if "GOOGLE_API_KEY" in os.environ:
+     print("✅ Google API Key found in secrets.")
+ else:
+     print("⚠️ Google API Key not found. Please set it in the Space secrets.")
+
+ if "OPENAI_API_KEY" in os.environ:
+     print("✅ OpenAI API Key found in secrets.")
+ else:
+     print("⚠️ OpenAI API Key not found. Please set it in the Space secrets if you use OpenAI models.")
+
+
+ # Create necessary directories
+ directories = ["downloads", "logs", "weights", "clean_song_output", "_XTTS_", "audio", "outputs"]
+ for directory in directories:
+     if not os.path.exists(directory):
+         os.makedirs(directory)
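+ # Note: os.makedirs(directory, exist_ok=True) would fold the existence check
+ # into a single call.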
+
+ class SoniTranslate:
+     def __init__(self):
+         # Device detection moved inside the function for ZeroGPU compatibility
+         self.result_diarize = None
+         self.align_language = None
+         self.result_source_lang = None
+         self.tts_info = self._get_tts_info()
+
+     def _get_tts_info(self):
+         # Simplified for this example
+         class TTS_Info:
+             def tts_list(self):
+                 try:
+                     return edge_tts_voices_list()
+                 except Exception as e:
+                     logger.warning(f"Could not get Edge-TTS voices: {e}")
+                     return ["en-US-JennyNeural-Female"]  # fallback
+         return TTS_Info()
+
+     # --- ZeroGPU Decorator ---
+     # duration=300 means 5 minutes max per request. Adjust if needed.
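+     # Calls that run past the limit are aborted by ZeroGPU, so long videos
+     # may need a larger duration value.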
+     @spaces.GPU(duration=300)
+     def multilingual_media_conversion(
+         self,
+         media_file,
+         link_media,
+         directory_input,
+         origin_language,
+         target_language,
+         tts_voice,
+         transcriber_model,
+         max_speakers,
+         is_gui=True,
+         progress=gr.Progress(),
+     ):
+         # Check device inside the GPU-decorated function: ZeroGPU attaches
+         # a CUDA device only while a decorated call is running
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         logger.info(f"Working on device: {self.device}")
+
+         try:
+             progress(0.05, desc="Starting process...")
+
+             # 1. Handle Input (priority: uploaded file > link > server path)
+             input_media = None
+             if media_file is not None:
+                 input_media = media_file.name  # Gradio uploads arrive as temp files
+             elif link_media:
+                 input_media = link_media
+             elif directory_input and os.path.exists(directory_input):
+                 input_media = directory_input
+
+             if not input_media:
+                 raise ValueError("No input media specified. Please upload a file or provide a URL.")
+
+             base_audio_wav = "audio.wav"
+             base_video_file = "video.mp4"
+
+             remove_files(base_audio_wav, base_video_file)
+
+             progress(0.1, desc="Processing input media...")
+             if is_audio_file(input_media):
+                 audio_preprocessor(False, input_media, base_audio_wav)
+             else:
+                 audio_video_preprocessor(False, input_media, base_video_file, base_audio_wav)
+
+             # 2. Transcription
+             progress(0.25, desc="Transcribing audio with WhisperX...")
+             source_lang_code = LANGUAGES[origin_language] if origin_language != "Automatic detection" else None
+
+             # Force float16 if cuda is available (ZeroGPU); int8 keeps CPU runs feasible
+             compute_type = "float16" if self.device == "cuda" else "int8"
+
+             audio, result = transcribe_speech(
+                 base_audio_wav,
+                 transcriber_model,
+                 compute_type,
+                 16,  # batch size
+                 source_lang_code
+             )
+
+             progress(0.4, desc="Aligning transcription...")
+             self.align_language = result["language"]
+             result = align_speech(audio, result)
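+             # Result shape (illustrative sketch):
+             #   {"segments": [{"start": 0.0, "end": 2.4, "text": "..."}, ...],
+             #    "language": "en"}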
+
+             # 3. Diarization
+             progress(0.5, desc="Separating speakers...")
+             hf_token = os.environ.get("HF_TOKEN")
+             if not hf_token:
+                 logger.warning("Hugging Face token not found. Diarization might fail.")
+
+             self.result_diarize = diarize_speech(
+                 base_audio_wav,
+                 result,
+                 1,  # minimum number of speakers
+                 max_speakers,
+                 hf_token,
+                 diarization_models["pyannote_3.1"]
+             )
+             self.result_source_lang = copy.deepcopy(self.result_diarize)
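+             # After diarization each segment carries a speaker tag (illustrative):
+             #   {"start": 0.0, "end": 2.4, "text": "...", "speaker": "SPEAKER_00"}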
+
+             # 4. Translation
+             progress(0.6, desc="Translating text...")
+             translate_to_code = LANGUAGES[target_language]
+             self.result_diarize["segments"] = translate_text(
+                 self.result_diarize["segments"],
+                 translate_to_code,
+                 "google_translator_batch",
+                 chunk_size=1800,
+                 source=self.align_language,
+             )
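+             # Only each segment's "text" is replaced; timing and speaker labels
+             # carry through so the dub can be re-timed per segment.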
+
+             # 5. Text-to-Speech
+             progress(0.75, desc="Generating dubbed audio...")
+             valid_speakers = audio_segmentation_to_voice(
+                 self.result_diarize,
+                 translate_to_code,
+                 is_gui,
+                 tts_voice
+             )
+
+             # 6. Audio Processing & Merging
+             progress(0.85, desc="Synchronizing and mixing audio...")
+             dub_audio_file = "audio_dub_solo.ogg"
+             remove_files(dub_audio_file)
+             audio_files, _ = accelerate_segments(self.result_diarize, 1.8, valid_speakers)  # 1.8 = max speed-up factor
+             create_translated_audio(self.result_diarize, audio_files, dub_audio_file, False, False)
+
+             mix_audio_file = "audio_mix.mp3"
+             remove_files(mix_audio_file)
+
+             # os.system relies on the PATH entry added during the FFmpeg setup above
+             command_volume_mix = f'ffmpeg -y -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[0:0]volume=0.1[a];[1:0]volume=1.5[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio_file}'
+             os.system(command_volume_mix)
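+             # Filter graph: original audio ducked to 10% ([0:0]volume=0.1), dub
+             # boosted to 150% ([1:0]volume=1.5), both mixed into a single track
+             # (amix=inputs=2:duration=longest), then encoded with libmp3lame.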
+
+             # 7. Final Video Creation
+             progress(0.95, desc="Creating final video...")
+             output_filename = "video_dub.mp4"
+             remove_files(output_filename)
+
+             if os.path.exists(base_video_file):
+                 # -c:a copy muxes the MP3 track into the MP4 without re-encoding;
+                 # switch to -c:a aac if a player rejects MP3-in-MP4
+                 os.system(f"ffmpeg -i {base_video_file} -i {mix_audio_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {output_filename}")
+                 final_output = media_out(input_media, translate_to_code, "", "mp4", file_obj=output_filename)
+             else:
+                 final_output = media_out(input_media, translate_to_code, "", "mp3", file_obj=mix_audio_file)
+
+             progress(1.0, desc="Done!")
+             return final_output
+
+         except Exception as e:
+             logger.error(f"An error occurred: {e}")
+             # raise so the message surfaces in the Gradio UI; a bare gr.Error(...)
+             # call constructs the exception without showing anything
+             raise gr.Error(f"An error occurred: {e}")
+
+ # Instantiate the class
+ SoniTr = SoniTranslate()
+
+ # Create Gradio Interface
+ with gr.Blocks(theme="Taithrah/Minimal") as app:
+     gr.Markdown("<center><h1>📽️ AI Video Dubbing Tool 🈷️</h1></center>")
+     gr.Markdown("Created by [aigolden](https://youtube.com/@aigolden) - based on [SoniTranslate](https://github.com/r3gm/SoniTranslate)")
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("### 1. Video Input")
+             video_file_input = gr.File(label="Upload video")
+             link_media_input = gr.Textbox(label="Or a YouTube link", placeholder="https://www.youtube.com/watch?v=...")
+
+             gr.Markdown("### 2. Dubbing Settings")
+             origin_language_input = gr.Dropdown(LANGUAGES_LIST, value="Automatic detection", label="Source video language")
+             target_language_input = gr.Dropdown(LANGUAGES_LIST[1:], value="Persian (fa)", label="Target dubbing language")
+             tts_voice_input = gr.Dropdown(SoniTr.tts_info.tts_list(), value="fa-IR-FaridNeural", label="Narrator voice")
+
+             with gr.Accordion("Advanced settings", open=False):
+                 transcriber_model_input = gr.Dropdown(
+                     ASR_MODEL_OPTIONS + find_whisper_models(),
+                     value="large-v3",
+                     label="Transcription model (Whisper)",
+                     info="Larger models are more accurate but slower."
+                 )
+                 max_speakers_input = gr.Slider(1, 10, value=2, step=1, label="Maximum number of speakers")
+
+             process_button = gr.Button("Start dubbing", variant="primary")
+
+         with gr.Column():
+             gr.Markdown("### 3. Output")
+             output_video = gr.Video(label="Dubbed video")
+             output_file = gr.File(label="Download file")
+
+     process_button.click(
+         SoniTr.multilingual_media_conversion,
+         inputs=[
+             video_file_input,
+             link_media_input,
+             gr.Textbox(visible=False),  # hidden placeholder for the unused directory_input parameter
+             origin_language_input,
+             target_language_input,
+             tts_voice_input,
+             transcriber_model_input,
+             max_speakers_input,
+         ],
+         outputs=[output_file]  # output_video is defined above but not wired to this event
+     )
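+     # Illustrative alternative (hypothetical, not what this commit does): to
+     # preview the result in the player as well, the handler could return the
+     # path for both components:
+     #   process_button.click(..., outputs=[output_video, output_file])
+     # with multilingual_media_conversion returning (final_output, final_output).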
+
+ if __name__ == "__main__":
+     app.launch(server_name="0.0.0.0", server_port=7860)