"""IndexTTS2 Gradio web UI.

Parses command-line options, downloads the model checkpoints, loads the
IndexTTS2 model, and serves a Gradio Blocks demo for single-utterance
speech generation with optional emotion control.

Environment setup notes:

    pip install protobuf==3.20.0
    pip uninstall torch torchvision torchaudio
    pip install -U torch torchvision torchaudio
"""

# Example client script for the "/gen_single" endpoint, kept as an inert
# string literal (it is never executed by this module).
'''
import os
from gradio_client import Client, handle_file
from shutil import copy2

# Initialize the Gradio client
client = Client("http://localhost:7860/")

# Replace with the path to your own reference audio file
reference_audio = '马正阳英文念白_vocals.wav'

# Shakespeare quotes (original English); examples only, add or remove freely
shakespeare_quotes = [
    "To be, or not to be: that is the question.",
    "All the world's a stage, and all the men and women merely players.",
    "There is nothing either good or bad, but thinking makes it so.",
    "The course of true love never did run smooth.",
    "Love is blind and lovers cannot see the pretty follies that themselves commit.",
    "All that glisters is not gold.",
    "Brevity is the soul of wit.",
    "What's in a name? That which we call a rose by any other word would smell as sweet.",
    "Sweet are the uses of adversity.",
    "Cowards die many times before their deaths; The valiant never taste of death but once."
]

# Output directory; make sure it exists
output_dir = 'MaZhengYang_IndexTTS2_Shakespeare_Audio'
os.makedirs(output_dir, exist_ok=True)

for index, quote_text in enumerate(shakespeare_quotes):
    try:
        print(f"Processing {index+1}/{len(shakespeare_quotes)}: {quote_text[:50]}...")

        # Call the Gradio API to synthesize speech
        result = client.predict(
            emo_control_method="Same as the voice reference",
            prompt=handle_file(reference_audio),  # use the reference audio you provided
            text=quote_text,  # the English quote to synthesize
            emo_ref_path=None,
            emo_weight=0.8,
            vec1=0,
            vec2=0,
            vec3=0,
            vec4=0,
            vec5=0,
            vec6=0,
            vec7=0,
            vec8=0,
            emo_text="",
            emo_random=False,
            max_text_tokens_per_sentence=120,
            param_16=True,
            param_17=0.8,
            param_18=30,
            param_19=0.8,
            param_20=0,
            param_21=3,
            param_22=10,
            param_23=1500,
            api_name="/gen_single"
        )

        # Assumes the returned result is a dict with the audio file path under result["value"]
        generated_audio_path = result["value"]

        # Build the sequence number, e.g. 000001, 000002, ...
        sequence_number = str(index + 1).zfill(6)
        new_audio_filename = f"{sequence_number}.wav"
        new_audio_path = os.path.join(output_dir, new_audio_filename)

        # Copy and rename the audio file
        copy2(generated_audio_path, new_audio_path)
        print(f"Audio saved to: {new_audio_path}")

        # Create a .txt file with the same stem and write the English quote into it
        txt_filename = f"{sequence_number}.txt"
        txt_path = os.path.join(output_dir, txt_filename)
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(quote_text)
        print(f"Text file saved to: {txt_path}")

    except Exception as e:
        print(f"Error processing quote '{quote_text}': {e}")

print("All processing completed!")
'''

# NOTE(review): statement order below is preserved from the original file
# (imports interleaved with setup side effects); presumably some of it is
# order-sensitive, so do not regroup without checking.
import json
import logging
import spaces
import os
import sys
import threading
import time
import warnings

import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Make the script directory and its "indextts" sub-directory importable.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
sys.path.append(os.path.join(current_dir, "indextts"))

import argparse

parser = argparse.ArgumentParser(description="IndexTTS WebUI")
parser.add_argument("--verbose", action="store_true", default=False, help="Enable verbose mode")
parser.add_argument("--port", type=int, default=7860, help="Port to run the web UI on")
parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to run the web UI on")
parser.add_argument("--model_dir", type=str, default="checkpoints", help="Model checkpoints directory")
parser.add_argument("--is_fp16", action="store_true", default=False, help="Fp16 infer")
cmd_args = parser.parse_args()

from tools.download_files import download_model_from_huggingface

# NOTE(review): the download target is always "<script dir>/checkpoints",
# independent of --model_dir — confirm this is intended.
download_model_from_huggingface(
    os.path.join(current_dir, "checkpoints"),
    os.path.join(current_dir, "checkpoints", "hf_cache"),
)

import gradio as gr
from indextts import infer
from indextts.infer_v2 import IndexTTS2
from tools.i18n.i18n import I18nAuto
from modelscope.hub import api

i18n = I18nAuto(language="Auto")
MODE = 'local'

# Honour --is_fp16 (previously parsed but ignored; its default is still False).
tts = IndexTTS2(
    model_dir=cmd_args.model_dir,
    cfg_path=os.path.join(cmd_args.model_dir, "config.yaml"),
    is_fp16=cmd_args.is_fp16,
    use_cuda_kernel=False,
)

# Supported UI languages
LANGUAGES = {
    "中文": "zh_CN",
    "English": "en_US"
}

# Radio choices; the list position is the emotion-control mode index that
# gen_single() and on_method_select() compare against (0..3).
EMO_CHOICES = [i18n("与音色参考音频相同"),
               i18n("使用情感参考音频"),
               i18n("使用情感向量控制"),
               i18n("使用情感描述文本控制")]

os.makedirs("outputs/tasks", exist_ok=True)
os.makedirs("prompts", exist_ok=True)

MAX_LENGTH_TO_USE_SPEED = 70


def _load_example_cases(cases_path, examples_dir="examples"):
    """Read example rows for ``gr.Examples`` from a JSON-lines file.

    Blank lines are skipped. Each remaining line is parsed as a JSON object
    and converted to a row whose column order matches the ``inputs`` list
    given to ``gr.Examples`` below: prompt audio path, emotion-control choice
    label, text, emotion audio path (or ``None``), emotion weight, emotion
    text, and the eight emotion-vector components.

    Args:
        cases_path: Path of the JSON-lines file to read.
        examples_dir: Directory that the audio file names are joined onto.

    Returns:
        A list of rows (lists), one per non-blank line.

    Raises:
        OSError: If ``cases_path`` cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    cases = []
    with open(cases_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            example = json.loads(raw_line)
            emo_audio = example.get("emo_audio", None)
            emo_audio_path = os.path.join(examples_dir, emo_audio) if emo_audio else None
            cases.append([
                os.path.join(examples_dir, example.get("prompt_audio", "sample_prompt.wav")),
                EMO_CHOICES[example.get("emo_mode", 0)],
                example.get("text"),
                emo_audio_path,
                example.get("emo_weight", 1.0),
                example.get("emo_text", ""),
                *(example.get(f"emo_vec_{i}", 0) for i in range(1, 9)),
            ])
    return cases


example_cases = _load_example_cases("examples/cases.jsonl")


@spaces.GPU
def gen_single(emo_control_method, prompt, text,
               emo_ref_path, emo_weight,
               vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8,
               emo_text, emo_random,
               max_text_tokens_per_sentence=120,
               *args, progress=gr.Progress()):
    """Synthesize one utterance and return an update for the output audio.

    Args:
        emo_control_method: Emotion-control mode index (0..3, matching the
            position in ``EMO_CHOICES``), or an object exposing that index
            as ``.value``.
        prompt: Speaker reference audio, passed to ``tts.infer`` as
            ``spk_audio_prompt``.
        text: Text to synthesize.
        emo_ref_path: Emotion reference audio; ignored in mode 0.
        emo_weight: Emotion weight, passed as ``emo_alpha``; forced to 1.0
            in mode 0.
        vec1..vec8: Emotion-vector components; only used in mode 2.
        emo_text: Emotion description text; enabled only in mode 3.
        emo_random: Passed to ``tts.infer`` as ``use_random``.
        max_text_tokens_per_sentence: Sentence-splitting token limit.
        *args: Exactly eight advanced sampling values, in the order
            ``do_sample, top_p, top_k, temperature, length_penalty,
            num_beams, repetition_penalty, max_mel_tokens``.
        progress: Gradio progress tracker, attached to the model as
            ``tts.gr_progress``.

    Returns:
        ``gr.update(value=<tts.infer result>, visible=True)``, or ``None``
        when mode 2 is selected and the vector components sum to more
        than 1.5 (a Gradio warning is shown instead).

    Raises:
        ValueError: If ``args`` does not contain exactly eight values.
    """
    # Second-resolution timestamp in the output file name.
    output_path = os.path.join("outputs", f"spk_{int(time.time())}.wav")

    # set gradio progress
    tts.gr_progress = progress

    (do_sample, top_p, top_k, temperature,
     length_penalty, num_beams, repetition_penalty, max_mel_tokens) = args
    kwargs = {
        "do_sample": bool(do_sample),
        "top_p": float(top_p),
        # A non-positive top_k is forwarded as None.
        "top_k": int(top_k) if int(top_k) > 0 else None,
        "temperature": float(temperature),
        "length_penalty": float(length_penalty),
        # Coerced like the other numeric parameters (was passed through raw).
        "num_beams": int(num_beams),
        "repetition_penalty": float(repetition_penalty),
        "max_mel_tokens": int(max_mel_tokens),
        # "typical_sampling": bool(typical_sampling),
        # "typical_mass": float(typical_mass),
    }

    if not isinstance(emo_control_method, int):
        emo_control_method = emo_control_method.value

    if emo_control_method == 0:
        # Mode 0: no separate emotion reference is forwarded.
        emo_ref_path = None
        emo_weight = 1.0

    vec = None
    if emo_control_method == 2:
        vec = [vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8]
        if sum(vec) > 1.5:
            gr.Warning(i18n("情感向量之和不能超过1.5,请调整后重试。"))
            return None

    print(f"Emo control mode:{emo_control_method},vec:{vec}")
    output = tts.infer(spk_audio_prompt=prompt, text=text,
                       output_path=output_path,
                       emo_audio_prompt=emo_ref_path, emo_alpha=emo_weight,
                       emo_vector=vec,
                       use_emo_text=(emo_control_method == 3), emo_text=emo_text,
                       use_random=emo_random,
                       verbose=cmd_args.verbose,
                       max_text_tokens_per_sentence=int(max_text_tokens_per_sentence),
                       **kwargs)
    return gr.update(value=output, visible=True)


def update_prompt_audio():
    """Return an update that makes the generate button interactive."""
    return gr.update(interactive=True)


with gr.Blocks(title="IndexTTS Demo") as demo:
    # NOTE(review): `mutex`, `prompt_list` and `default` are not referenced
    # anywhere else in the visible code; kept to avoid changing module state.
    mutex = threading.Lock()
    gr.HTML('''

IndexTTS2: A Breakthrough in Emotionally Expressive and Duration-Controlled Auto-Regressive Zero-Shot Text-to-Speech

''')
    with gr.Tab(i18n("音频生成")):
        with gr.Row():
            os.makedirs("prompts", exist_ok=True)
            prompt_audio = gr.Audio(label=i18n("音色参考音频"), key="prompt_audio",
                                    sources=["upload", "microphone"], type="filepath")
            prompt_list = os.listdir("prompts")
            default = ''
            if prompt_list:
                default = prompt_list[0]
            with gr.Column():
                input_text_single = gr.TextArea(label=i18n("文本"), key="input_text_single",
                                                placeholder=i18n("请输入目标文本"),
                                                info=f"{i18n('当前模型版本')}{tts.model_version or '1.0'}")
                gen_button = gr.Button(i18n("生成语音"), key="gen_button", interactive=True)
            output_audio = gr.Audio(label=i18n("生成结果"), visible=True, key="output_audio")

        with gr.Accordion(i18n("功能设置")):
            # Emotion control method selection
            with gr.Row():
                emo_control_method = gr.Radio(
                    choices=EMO_CHOICES,
                    type="index",
                    value=EMO_CHOICES[0], label=i18n("情感控制方式"))

        # Emotion reference audio
        with gr.Group(visible=False) as emotion_reference_group:
            with gr.Row():
                emo_upload = gr.Audio(label=i18n("上传情感参考音频"), type="filepath")

            with gr.Row():
                emo_weight = gr.Slider(label=i18n("情感权重"), minimum=0.0, maximum=1.6, value=0.8, step=0.01)

        # Random emotion sampling
        with gr.Row():
            emo_random = gr.Checkbox(label=i18n("情感随机采样"), value=False, visible=False)

        # Emotion vector control
        with gr.Group(visible=False) as emotion_vector_group:
            with gr.Row():
                with gr.Column():
                    vec1 = gr.Slider(label=i18n("喜"), minimum=0.0, maximum=1.4, value=0.0, step=0.05)
                    vec2 = gr.Slider(label=i18n("怒"), minimum=0.0, maximum=1.4, value=0.0, step=0.05)
                    vec3 = gr.Slider(label=i18n("哀"), minimum=0.0, maximum=1.4, value=0.0, step=0.05)
                    vec4 = gr.Slider(label=i18n("惧"), minimum=0.0, maximum=1.4, value=0.0, step=0.05)
                with gr.Column():
                    vec5 = gr.Slider(label=i18n("厌恶"), minimum=0.0, maximum=1.4, value=0.0, step=0.05)
                    vec6 = gr.Slider(label=i18n("低落"), minimum=0.0, maximum=1.4, value=0.0, step=0.05)
                    vec7 = gr.Slider(label=i18n("惊喜"), minimum=0.0, maximum=1.4, value=0.0, step=0.05)
                    vec8 = gr.Slider(label=i18n("平静"), minimum=0.0, maximum=1.4, value=0.0, step=0.05)

        with gr.Group(visible=False) as emo_text_group:
            with gr.Row():
                emo_text = gr.Textbox(label=i18n("情感描述文本"), placeholder=i18n("请输入情感描述文本"), value="", info=i18n("例如:高兴,愤怒,悲伤等"))

        with gr.Accordion(i18n("高级生成参数设置"), open=False):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(f"**{i18n('GPT2 采样设置')}** _{i18n('参数会影响音频多样性和生成速度详见')}[Generation strategies](https://huggingface.co/docs/transformers/main/en/generation_strategies)_")
                    with gr.Row():
                        do_sample = gr.Checkbox(label="do_sample", value=True, info="是否进行采样")
                        temperature = gr.Slider(label="temperature", minimum=0.1, maximum=2.0, value=0.8, step=0.1)
                    with gr.Row():
                        top_p = gr.Slider(label="top_p", minimum=0.0, maximum=1.0, value=0.8, step=0.01)
                        top_k = gr.Slider(label="top_k", minimum=0, maximum=100, value=30, step=1)
                        num_beams = gr.Slider(label="num_beams", value=3, minimum=1, maximum=10, step=1)
                    with gr.Row():
                        repetition_penalty = gr.Number(label="repetition_penalty", precision=None, value=10.0, minimum=0.1, maximum=20.0, step=0.1)
                        length_penalty = gr.Number(label="length_penalty", precision=None, value=0.0, minimum=-2.0, maximum=2.0, step=0.1)
                    max_mel_tokens = gr.Slider(label="max_mel_tokens", value=1500, minimum=50, maximum=tts.cfg.gpt.max_mel_tokens, step=10, info="生成Token最大数量,过小导致音频被截断", key="max_mel_tokens")
                    # with gr.Row():
                    #     typical_sampling = gr.Checkbox(label="typical_sampling", value=False, info="不建议使用")
                    #     typical_mass = gr.Slider(label="typical_mass", value=0.9, minimum=0.0, maximum=1.0, step=0.1)
                with gr.Column(scale=2):
                    gr.Markdown(f'**{i18n("分句设置")}** _{i18n("参数会影响音频质量和生成速度")}_')
                    with gr.Row():
                        max_text_tokens_per_sentence = gr.Slider(
                            label=i18n("分句最大Token数"), value=120, minimum=20, maximum=tts.cfg.gpt.max_text_tokens, step=2, key="max_text_tokens_per_sentence",
                            info=i18n("建议80~200之间,值越大,分句越长;值越小,分句越碎;过小过大都可能导致音频质量不高"),
                        )
                    with gr.Accordion(i18n("预览分句结果"), open=True) as sentences_settings:
                        sentences_preview = gr.Dataframe(
                            headers=[i18n("序号"), i18n("分句内容"), i18n("Token数")],
                            key="sentences_preview",
                            wrap=True,
                        )
            # Order must match the tuple unpacking of *args in gen_single().
            advanced_params = [
                do_sample, top_p, top_k, temperature,
                length_penalty, num_beams, repetition_penalty, max_mel_tokens,
                # typical_sampling, typical_mass,
            ]

        if example_cases:
            # Column order must match the rows built by _load_example_cases().
            gr.Examples(
                examples=example_cases,
                examples_per_page=20,
                inputs=[prompt_audio,
                        emo_control_method,
                        input_text_single,
                        emo_upload,
                        emo_weight,
                        emo_text,
                        vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8]
            )

    def on_input_text_change(text, max_tokens_per_sentence):
        """Refresh the sentence-split preview table.

        Tokenizes ``text`` with the model tokenizer, splits it into
        sentences of at most ``max_tokens_per_sentence`` tokens, and returns
        one row per sentence: index, joined tokens, token count. Empty text
        yields an empty table with the same headers.
        """
        if text:
            text_tokens_list = tts.tokenizer.tokenize(text)
            sentences = tts.tokenizer.split_sentences(
                text_tokens_list, max_tokens_per_sentence=int(max_tokens_per_sentence))
            data = [[i, ''.join(s), len(s)] for i, s in enumerate(sentences)]
            return {
                sentences_preview: gr.update(value=data, visible=True, type="array"),
            }
        df = pd.DataFrame([], columns=[i18n("序号"), i18n("分句内容"), i18n("Token数")])
        return {
            sentences_preview: gr.update(value=df),
        }

    def on_method_select(emo_control_method):
        """Return visibility updates for the emotion-control widgets.

        The tuple order matches the ``outputs`` of the ``select`` event
        below: emotion reference group, random-sampling checkbox, emotion
        vector group, emotion text group. Any mode other than 1, 2 or 3
        hides all four.
        """
        visibility_by_mode = {
            1: (True, False, False, False),
            2: (False, True, True, False),
            3: (False, True, False, True),
        }
        flags = visibility_by_mode.get(emo_control_method, (False, False, False, False))
        return tuple(gr.update(visible=flag) for flag in flags)

    emo_control_method.select(on_method_select,
        inputs=[emo_control_method],
        outputs=[emotion_reference_group,
                 emo_random,
                 emotion_vector_group,
                 emo_text_group]
    )

    input_text_single.change(
        on_input_text_change,
        inputs=[input_text_single, max_text_tokens_per_sentence],
        outputs=[sentences_preview]
    )
    max_text_tokens_per_sentence.change(
        on_input_text_change,
        inputs=[input_text_single, max_text_tokens_per_sentence],
        outputs=[sentences_preview]
    )
    prompt_audio.upload(update_prompt_audio,
                        inputs=[],
                        outputs=[gen_button])

    # Input order must match the positional parameters of gen_single().
    gen_button.click(gen_single,
                     inputs=[emo_control_method, prompt_audio, input_text_single, emo_upload, emo_weight,
                             vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8,
                             emo_text, emo_random,
                             max_text_tokens_per_sentence,
                             *advanced_params,
                             ],
                     outputs=[output_audio])


if __name__ == "__main__":
    demo.queue(20)
    # Honour --host/--port (previously parsed but ignored; defaults are the
    # same values that used to be hard-coded here).
    demo.launch(server_name=cmd_args.host, server_port=cmd_args.port, share=True)