''' pip install protobuf==3.20.0 pip uninstall torch torchvision torchaudio pip install -U torch torchvision torchaudio ''' ''' import os from gradio_client import Client, handle_file from shutil import copy2 # 初始化Gradio客户端 client = Client("http://localhost:7860/") # 请替换为您准备好的参考音频文件路径 reference_audio = '马正阳英文念白_vocals.wav' # 莎士比亚名言列表(英文原文),这里只是示例,您可以自行增删 shakespeare_quotes = [ "To be, or not to be: that is the question.", "All the world's a stage, and all the men and women merely players.", "There is nothing either good or bad, but thinking makes it so.", "The course of true love never did run smooth.", "Love is blind and lovers cannot see the pretty follies that themselves commit.", "All that glisters is not gold.", "Brevity is the soul of wit.", "What's in a name? That which we call a rose by any other word would smell as sweet.", "Sweet are the uses of adversity.", "Cowards die many times before their deaths; The valiant never taste of death but once." ] # 输出目录,确保此目录存在 output_dir = 'MaZhengYang_IndexTTS2_Shakespeare_Audio' os.makedirs(output_dir, exist_ok=True) for index, quote_text in enumerate(shakespeare_quotes): try: print(f"Processing {index+1}/{len(shakespeare_quotes)}: {quote_text[:50]}...") # 调用Gradio API进行语音合成 result = client.predict( emo_control_method="Same as the voice reference", prompt=handle_file(reference_audio), # 使用您提供的参考音频 text=quote_text, # 填入当前要合成的名言英文原文 emo_ref_path=None, emo_weight=0.8, vec1=0, vec2=0, vec3=0, vec4=0, vec5=0, vec6=0, vec7=0, vec8=0, emo_text="", emo_random=False, max_text_tokens_per_sentence=120, param_16=True, param_17=0.8, param_18=30, param_19=0.8, param_20=0, param_21=3, param_22=10, param_23=1500, api_name="/gen_single" ) # 假设返回的result是一个字典,且音频文件路径在result["value"]中 generated_audio_path = result["value"] # 生成序列号,例如 000001, 000002, ... sequence_number = str(index + 1).zfill(6) new_audio_filename = f"{sequence_number}.wav" new_audio_path = os.path.join(output_dir, new_audio_filename) # 复制并重命名音频文件 copy2(generated_audio_path, new_audio_path) print(f"Audio saved to: {new_audio_path}") # 创建同名的.txt文件并写入英文名言 txt_filename = f"{sequence_number}.txt" txt_path = os.path.join(output_dir, txt_filename) with open(txt_path, 'w', encoding='utf-8') as f: f.write(quote_text) print(f"Text file saved to: {txt_path}") except Exception as e: print(f"Error processing quote '{quote_text}': {e}") print("All processing completed!") ''' import json import logging import spaces import os import sys import threading import time import warnings import pandas as pd warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=UserWarning) current_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.append(current_dir) sys.path.append(os.path.join(current_dir, "indextts")) import argparse parser = argparse.ArgumentParser(description="IndexTTS WebUI") parser.add_argument("--verbose", action="store_true", default=False, help="Enable verbose mode") parser.add_argument("--port", type=int, default=7860, help="Port to run the web UI on") parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to run the web UI on") parser.add_argument("--model_dir", type=str, default="checkpoints", help="Model checkpoints directory") parser.add_argument("--is_fp16", action="store_true", default=False, help="Fp16 infer") cmd_args = parser.parse_args() from tools.download_files import download_model_from_huggingface download_model_from_huggingface(os.path.join(current_dir,"checkpoints"), os.path.join(current_dir, "checkpoints","hf_cache")) import gradio as gr from indextts import infer from indextts.infer_v2 import IndexTTS2 from tools.i18n.i18n import I18nAuto from modelscope.hub import api i18n = I18nAuto(language="Auto") MODE = 'local' tts = IndexTTS2(model_dir=cmd_args.model_dir, cfg_path=os.path.join(cmd_args.model_dir, "config.yaml"), is_fp16=False,use_cuda_kernel=False) # 支持的语言列表 LANGUAGES = { "中文": "zh_CN", "English": "en_US" } EMO_CHOICES = [i18n("与音色参考音频相同"), i18n("使用情感参考音频"), i18n("使用情感向量控制"), i18n("使用情感描述文本控制")] os.makedirs("outputs/tasks",exist_ok=True) os.makedirs("prompts",exist_ok=True) MAX_LENGTH_TO_USE_SPEED = 70 with open("examples/cases.jsonl", "r", encoding="utf-8") as f: example_cases = [] for line in f: line = line.strip() if not line: continue example = json.loads(line) if example.get("emo_audio",None): emo_audio_path = os.path.join("examples",example["emo_audio"]) else: emo_audio_path = None example_cases.append([os.path.join("examples", example.get("prompt_audio", "sample_prompt.wav")), EMO_CHOICES[example.get("emo_mode",0)], example.get("text"), emo_audio_path, example.get("emo_weight",1.0), example.get("emo_text",""), example.get("emo_vec_1",0), example.get("emo_vec_2",0), example.get("emo_vec_3",0), example.get("emo_vec_4",0), example.get("emo_vec_5",0), example.get("emo_vec_6",0), example.get("emo_vec_7",0), example.get("emo_vec_8",0)] ) @spaces.GPU def gen_single(emo_control_method,prompt, text, emo_ref_path, emo_weight, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, emo_text,emo_random, max_text_tokens_per_sentence=120, *args, progress=gr.Progress()): output_path = None if not output_path: output_path = os.path.join("outputs", f"spk_{int(time.time())}.wav") # set gradio progress tts.gr_progress = progress do_sample, top_p, top_k, temperature, \ length_penalty, num_beams, repetition_penalty, max_mel_tokens = args kwargs = { "do_sample": bool(do_sample), "top_p": float(top_p), "top_k": int(top_k) if int(top_k) > 0 else None, "temperature": float(temperature), "length_penalty": float(length_penalty), "num_beams": num_beams, "repetition_penalty": float(repetition_penalty), "max_mel_tokens": int(max_mel_tokens), # "typical_sampling": bool(typical_sampling), # "typical_mass": float(typical_mass), } if type(emo_control_method) is not int: emo_control_method = emo_control_method.value if emo_control_method == 0: emo_ref_path = None emo_weight = 1.0 if emo_control_method == 1: emo_weight = emo_weight if emo_control_method == 2: vec = [vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8] vec_sum = sum([vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8]) if vec_sum > 1.5: gr.Warning(i18n("情感向量之和不能超过1.5,请调整后重试。")) return else: vec = None print(f"Emo control mode:{emo_control_method},vec:{vec}") output = tts.infer(spk_audio_prompt=prompt, text=text, output_path=output_path, emo_audio_prompt=emo_ref_path, emo_alpha=emo_weight, emo_vector=vec, use_emo_text=(emo_control_method==3), emo_text=emo_text,use_random=emo_random, verbose=cmd_args.verbose, max_text_tokens_per_sentence=int(max_text_tokens_per_sentence), **kwargs) return gr.update(value=output,visible=True) def update_prompt_audio(): update_button = gr.update(interactive=True) return update_button with gr.Blocks(title="IndexTTS Demo") as demo: mutex = threading.Lock() gr.HTML('''