import os
import sys
import subprocess

# --- FFmpeg Setup (Replaces packages.txt) ---
try:
    import imageio_ffmpeg

    ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
    ffmpeg_dir = os.path.dirname(ffmpeg_path)
    # Add the ffmpeg binary directory to the system PATH so os.system("ffmpeg") works
    os.environ["PATH"] += os.pathsep + ffmpeg_dir
    # Ensure the binary is executable
    subprocess.run(["chmod", "+x", ffmpeg_path])
    print(f"✅ FFmpeg configured at: {ffmpeg_path}")
except ImportError:
    print("⚠️ imageio-ffmpeg not found. Please add it to requirements.txt")

# --- Main Imports ---
import gradio as gr
import torch
import spaces  # Required for ZeroGPU
from soni_translate.logging_setup import logger, set_logging_level, configure_logging_libs

configure_logging_libs()

import whisperx
from soni_translate.preprocessor import audio_video_preprocessor, audio_preprocessor
from soni_translate.postprocessor import (
    media_out,
    get_no_ext_filename,
    sound_separate,
    get_subtitle_speaker,
)
from soni_translate.speech_segmentation import (
    transcribe_speech,
    align_speech,
    diarize_speech,
    ASR_MODEL_OPTIONS,
    find_whisper_models,
    diarization_models,
    COMPUTE_TYPE_CPU,
    COMPUTE_TYPE_GPU,
)
from soni_translate.translate_segments import translate_text, TRANSLATION_PROCESS_OPTIONS
from soni_translate.text_to_speech import (
    audio_segmentation_to_voice,
    edge_tts_voices_list,
    coqui_xtts_voices_list,
    piper_tts_voices_list,
)
from soni_translate.audio_segments import create_translated_audio, accelerate_segments
from soni_translate.language_configuration import LANGUAGES, LANGUAGES_LIST
from soni_translate.utils import (
    remove_files,
    get_link_list,
    get_valid_files,
    is_audio_file,
    is_subtitle_file,
)
from soni_translate.text_multiformat_processor import (
    process_subtitles,
    srt_file_to_segments,
    break_aling_segments,
)
from soni_translate.languages_gui import language_data
import hashlib
import json
import copy
from pydub import AudioSegment

# Check for API keys from Hugging Face Secrets
if "GOOGLE_API_KEY" in os.environ:
    print("✅ Google API Key found in secrets.")
else:
    print("⚠️ Google API Key not found. Please set it in the Space secrets.")

if "OPENAI_API_KEY" in os.environ:
    print("✅ OpenAI API Key found in secrets.")
else:
    print("⚠️ OpenAI API Key not found. Please set it in the Space secrets if you use OpenAI models.")

# Create necessary directories
directories = ["downloads", "logs", "weights", "clean_song_output", "_XTTS_", "audio", "outputs"]
for directory in directories:
    os.makedirs(directory, exist_ok=True)


class SoniTranslate:
    def __init__(self):
        # Device detection moved inside the GPU function for ZeroGPU compatibility
        self.result_diarize = None
        self.align_language = None
        self.result_source_lang = None
        self.tts_info = self._get_tts_info()

    def _get_tts_info(self):
        # Simplified for this example
        class TTS_Info:
            def tts_list(self):
                try:
                    return edge_tts_voices_list()
                except Exception as e:
                    logger.warning(f"Could not get Edge-TTS voices: {e}")
                    return ["en-US-JennyNeural-Female"]  # fallback

        return TTS_Info()

    # --- ZeroGPU Decorator ---
    # duration=300 means 5 minutes max per request. Adjust if needed.
    @spaces.GPU(duration=300)
    def multilingual_media_conversion(
        self,
        media_file,
        link_media,
        directory_input,
        origin_language,
        target_language,
        tts_voice,
        transcriber_model,
        max_speakers,
        is_gui=True,
        progress=gr.Progress(),
    ):
        # Check the device inside the GPU-decorated function
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Working on device: {self.device}")

        try:
            progress(0.05, desc="Starting process...")

            # 1. Handle Input
            input_media = None
            if media_file is not None:
                input_media = media_file.name
            elif link_media:
                input_media = link_media
            elif directory_input and os.path.exists(directory_input):
                input_media = directory_input

            if not input_media:
                raise ValueError("No input media specified. Please upload a file or provide a URL.")

            base_audio_wav = "audio.wav"
            base_video_file = "video.mp4"
            remove_files(base_audio_wav, base_video_file)

            progress(0.1, desc="Processing input media...")
            if is_audio_file(input_media):
                audio_preprocessor(False, input_media, base_audio_wav)
            else:
                audio_video_preprocessor(False, input_media, base_video_file, base_audio_wav)

            # 2. Transcription
            progress(0.25, desc="Transcribing audio with WhisperX...")
            source_lang_code = (
                LANGUAGES[origin_language]
                if origin_language != "Automatic detection"
                else None
            )
            # Force float16 if cuda is available (ZeroGPU)
            compute_type = "float16" if self.device == "cuda" else "int8"
            audio, result = transcribe_speech(
                base_audio_wav,
                transcriber_model,
                compute_type,
                16,  # batch size
                source_lang_code,
            )

            progress(0.4, desc="Aligning transcription...")
            self.align_language = result["language"]
            result = align_speech(audio, result)

            # 3. Diarization
            progress(0.5, desc="Separating speakers...")
            hf_token = os.environ.get("HF_TOKEN")
            if not hf_token:
                logger.warning("Hugging Face token not found. Diarization might fail.")
            self.result_diarize = diarize_speech(
                base_audio_wav,
                result,
                1,  # minimum number of speakers
                max_speakers,
                hf_token,
                diarization_models["pyannote_3.1"],
            )
            self.result_source_lang = copy.deepcopy(self.result_diarize)

            # 4. Translation
            progress(0.6, desc="Translating text...")
            translate_to_code = LANGUAGES[target_language]
            self.result_diarize["segments"] = translate_text(
                self.result_diarize["segments"],
                translate_to_code,
                "google_translator_batch",
                chunk_size=1800,
                source=self.align_language,
            )

            # 5. Text-to-Speech
            progress(0.75, desc="Generating dubbed audio...")
            valid_speakers = audio_segmentation_to_voice(
                self.result_diarize, translate_to_code, is_gui, tts_voice
            )

            # 6. Audio Processing & Merging
            progress(0.85, desc="Synchronizing and mixing audio...")
            dub_audio_file = "audio_dub_solo.ogg"
            remove_files(dub_audio_file)
            # Speed up segments (up to 1.8x) so the dub fits the original timing
            audio_files, _ = accelerate_segments(self.result_diarize, 1.8, valid_speakers)
            create_translated_audio(self.result_diarize, audio_files, dub_audio_file, False, False)

            mix_audio_file = "audio_mix.mp3"
            remove_files(mix_audio_file)
            # Duck the original track under the dub; os.system relies on the PATH set at the top
            command_volume_mix = (
                f'ffmpeg -y -i {base_audio_wav} -i {dub_audio_file} '
                f'-filter_complex "[0:0]volume=0.1[a];[1:0]volume=1.5[b];'
                f'[a][b]amix=inputs=2:duration=longest" '
                f'-c:a libmp3lame {mix_audio_file}'
            )
            os.system(command_volume_mix)

            # 7. Final Video Creation
            progress(0.95, desc="Creating final video...")
            output_filename = "video_dub.mp4"
            remove_files(output_filename)

            if os.path.exists(base_video_file):
                os.system(
                    f"ffmpeg -y -i {base_video_file} -i {mix_audio_file} "
                    f"-c:v copy -c:a copy -map 0:v -map 1:a -shortest {output_filename}"
                )
                final_output = media_out(
                    input_media, translate_to_code, "", "mp4", file_obj=output_filename
                )
            else:
                final_output = media_out(
                    input_media, translate_to_code, "", "mp3", file_obj=mix_audio_file
                )

            progress(1.0, desc="Done!")
            return final_output

        except Exception as e:
            logger.error(f"An error occurred: {e}")
            # gr.Error must be raised (not just constructed) to surface in the Gradio UI
            raise gr.Error(f"An error occurred: {e}")


# Instantiate the class
SoniTr = SoniTranslate()

# Create Gradio Interface
with gr.Blocks(theme="Taithrah/Minimal") as app:
    gr.Markdown("