MichonGoddijn231849 committed
Commit 86bb0f2 · 1 Parent(s): 387a013

coqui xtts instead of kokoro + piper

Dockerfile CHANGED
@@ -1,7 +1,8 @@
-# Hugging Face Spaces - MrrrMe with HYBRID TTS (Kokoro EN + Piper NL)
+# Hugging Face Spaces - MrrrMe with Coqui XTTS v2
 FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
 
 # Install system dependencies
+# espeak-ng is required by Coqui TTS for text phonemization
 RUN apt-get update && apt-get install -y \
     bash \
     git \
@@ -14,17 +15,12 @@ RUN apt-get update && apt-get install -y \
     python3.11-dev \
     libgl1-mesa-glx \
     libglib2.0-0 \
-    libsm6 \
-    libxext6 \
     ffmpeg \
     portaudio19-dev \
     libsndfile1 \
+    espeak-ng \
     nginx \
     gnupg \
-    htop \
-    vim \
-    nano \
-    unzip \
     && rm -rf /var/lib/apt/lists/*
 
 # Install Node.js 20
@@ -32,22 +28,6 @@ RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
     apt-get install -y nodejs && \
     rm -rf /var/lib/apt/lists/*
 
-# Install Piper TTS for Dutch
-RUN wget https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_amd64.tar.gz -O /tmp/piper.tar.gz && \
-    tar -xzf /tmp/piper.tar.gz -C /opt && \
-    ln -s /opt/piper/piper /usr/local/bin/piper && \
-    chmod +x /opt/piper/piper && \
-    rm /tmp/piper.tar.gz
-
-# Download Dutch Piper voices
-RUN mkdir -p /opt/piper/voices && cd /opt/piper/voices && \
-    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/nl/nl_NL/mls/medium/nl_NL-mls-medium.onnx && \
-    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/nl/nl_NL/mls/medium/nl_NL-mls-medium.onnx.json && \
-    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/nl/nl_BE/rdh/medium/nl_BE-rdh-medium.onnx && \
-    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/nl/nl_BE/rdh/medium/nl_BE-rdh-medium.onnx.json
-
-ENV PIPER_VOICES_DIR=/opt/piper/voices
-
 # Set Python 3.11 as default
 RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
     update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
@@ -61,33 +41,21 @@ RUN python3.11 -m pip install --no-cache-dir \
     torchaudio==2.4.0 \
     --index-url https://download.pytorch.org/whl/cu118
 
-# Create constraints file
-RUN echo "torch==2.4.0" > /tmp/constraints.txt && \
-    echo "torchvision==0.19.0" >> /tmp/constraints.txt && \
-    echo "torchaudio==2.4.0" >> /tmp/constraints.txt && \
-    echo "httpx<0.28.0" >> /tmp/constraints.txt
+# Agree to the Coqui terms of service (required for XTTS)
+ENV COQUI_TOS_AGREED=1
 
 # Install Python dependencies
 COPY requirements_docker.txt ./
-RUN PIP_CONSTRAINT=/tmp/constraints.txt python3.11 -m pip install --no-cache-dir -r requirements_docker.txt
+RUN python3.11 -m pip install --no-cache-dir -r requirements_docker.txt
 
-# Install Kokoro for English
-RUN python3.11 -m pip install --no-cache-dir kokoro-onnx==0.4.9
-
-# Download Kokoro models for English
-RUN python3.11 -c "\
-import os; \
-import urllib.request; \
-os.makedirs('/app/kokoro_models', exist_ok=True); \
-print('Downloading Kokoro models...'); \
-urllib.request.urlretrieve('https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/kokoro-v0_19.onnx', '/app/kokoro_models/kokoro-v0_19.onnx'); \
-urllib.request.urlretrieve('https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/voices.bin', '/app/kokoro_models/voices.bin'); \
-print('Kokoro ready'); \
-"
-
-# Install avatar dependencies
+# Pre-download the XTTS v2 model at build time so it is not fetched at runtime.
+# A small import-and-load script triggers the download.
+# The model is stored under /root/.local/share/tts/ by default.
+RUN python3.11 -c "from TTS.api import TTS; print('⏳ Downloading XTTS v2 model...'); TTS('tts_models/multilingual/multi-dataset/xtts_v2'); print('✅ Download complete.')"
+
+# Install avatar dependencies (FastAPI, etc.)
 RUN python3.11 -m pip install --no-cache-dir \
-    fastapi uvicorn python-multipart pydub websockets onnxruntime piper-tts
+    fastapi uvicorn python-multipart pydub websockets onnxruntime
 
 # Copy application code
 COPY --link --chown=1000:1000 mrrrme/ ./mrrrme/
@@ -98,7 +66,7 @@ COPY --link --chown=1000:1000 weights/ ./weights/
 # Create directories
 RUN mkdir -p /app/weights /app/avatar/static
 
-# Fix openface bug
+# Fix openface bug (legacy)
 RUN python3.11 -c "import os; fp='/usr/local/lib/python3.11/dist-packages/openface/multitask_model.py'; c=open(fp).read() if os.path.exists(fp) else ''; exec(\"if os.path.exists(fp) and 'import cv2' not in c:\\n open(fp,'w').write('import cv2\\\\n'+c)\\n print('Patched')\")"
 
 # Build frontend
@@ -127,12 +95,19 @@ RUN mkdir -p /etc/nginx/certs && \
     -subj "/CN=mrrrme.hf.space"
 
 # Create startup script
-RUN printf '#!/bin/bash\nset -e\nexport HOME=/tmp\nmkdir -p /tmp\ngit config --global user.name "michon" 2>/dev/null || true\ngit config --global user.email "[email protected]" 2>/dev/null || true\nif [ -d "/data" ] && [ -w "/data" ]; then\n echo "Persistent storage: /data"\n chmod 777 /data 2>/dev/null || true\nelse\n echo "Ephemeral storage: /tmp"\nfi\nexport PIPER_VOICES_DIR=/opt/piper/voices\npkill -f "mrrrme.backend_server" 2>/dev/null || true\npkill -f "speak_server.py" 2>/dev/null || true\npkill -f "node server.js" 2>/dev/null || true\npkill -f "nginx" 2>/dev/null || true\nsleep 2\necho "Starting MrrrMe..."\ncd /app && python3.11 -m mrrrme.backend_server &\ncd /app/avatar && python3.11 speak_server.py &\ncd /app/frontend/.next/standalone && HOSTNAME=0.0.0.0 PORT=3001 node server.js &\nsleep 10\nnginx -g "daemon off;" &\necho "Ready!"\nwait\n' > /app/start.sh && chmod +x /app/start.sh
+RUN printf '#!/bin/bash\nset -e\nexport HOME=/home/user\nmkdir -p /tmp\n\n# Agree to the TOS at runtime as well, just in case\nexport COQUI_TOS_AGREED=1\n\nif [ -d "/data" ] && [ -w "/data" ]; then\n echo "Persistent storage: /data"\n chmod 777 /data 2>/dev/null || true\nelse\n echo "Ephemeral storage: /tmp"\nfi\n\npkill -f "mrrrme.backend_server" 2>/dev/null || true\npkill -f "speak_server.py" 2>/dev/null || true\npkill -f "node server.js" 2>/dev/null || true\npkill -f "nginx" 2>/dev/null || true\n\nsleep 2\necho "Starting MrrrMe (XTTS v2 Enabled)..."\ncd /app && python3.11 -m mrrrme.backend_server &\ncd /app/avatar && python3.11 speak_server.py &\ncd /app/frontend/.next/standalone && HOSTNAME=0.0.0.0 PORT=3001 node server.js &\nsleep 10\nnginx -g "daemon off;" &\necho "Ready!"\nwait\n' > /app/start.sh && chmod +x /app/start.sh
 
 # Set ownership
 RUN chown -R 1000:1000 /app
 
+# Coqui TTS downloads models into the user's home directory.
+# The XTTS model above was downloaded as root at build time, but the
+# container runs as UID 1000, so copy the model cache into /home/user
+# and point HOME there (ENV HOME below; start.sh exports the same value).
+RUN mkdir -p /home/user && cp -r /root/.local /home/user/.local && chown -R 1000:1000 /home/user
+
 USER 1000
+ENV HOME=/home/user
 
 EXPOSE 7860
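
Reviewer note on the model-cache handoff above: the image downloads XTTS as root at build time, then copies ~/.local into /home/user for UID 1000. A sanity check along these lines could run as a final build step to fail fast if the copy missed; this is a hypothetical helper, not part of the commit, and it assumes Coqui's default cache layout (model names with '/' replaced by '--' under $HOME/.local/share/tts/):

# check_xtts_cache.py (hypothetical build-time check)
import os

# Default Coqui TTS cache location under the current user's home
cache_dir = os.path.join(os.path.expanduser("~"), ".local", "share", "tts")
model_dir = os.path.join(cache_dir, "tts_models--multilingual--multi-dataset--xtts_v2")

if os.path.isdir(model_dir) and os.listdir(model_dir):
    print(f"OK: XTTS v2 cache present at {model_dir}")
else:
    raise SystemExit(f"XTTS v2 cache missing at {model_dir}; the model would re-download at runtime")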
 
avatar/speak_server.py CHANGED
@@ -1,10 +1,8 @@
-"""Avatar Backend - HYBRID TTS (Kokoro for EN, Piper for NL)"""
+"""Avatar Backend - Coqui XTTS v2 (Multi-lingual)"""
 import os
-import json
 import uuid
 import time
 import wave
-import subprocess
 from fastapi import FastAPI, Form, WebSocket
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
@@ -13,52 +11,41 @@ from pydub import AudioSegment
 from typing import List
 from dotenv import load_dotenv
 import torch
-import scipy.io.wavfile as wavfile
-import numpy as np
-from kokoro_onnx import Kokoro
+from TTS.api import TTS
 
 load_dotenv()
 
 OUT_DIR = "/tmp/avatar_static"
 os.makedirs(OUT_DIR, exist_ok=True)
 
-# HYBRID Voice Mapping
-KOKORO_VOICES = {
-    "female": "af_sarah",
-    "male": "am_adam"
+# XTTS v2 standard speakers
+# These speakers support both English (en) and Dutch (nl) natively
+VOICE_MAP = {
+    "female": "Ana Florence",
+    "male": "Andrew Chipper"
 }
 
-PIPER_VOICES = {
-    "female": "nl_NL-mls-medium",
-    "male": "nl_BE-rdh-medium"
-}
-
-PIPER_PATH = "piper"
-
 app = FastAPI()
 app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 app.mount("/static", StaticFiles(directory=OUT_DIR), name="static")
 
 active_connections: List[WebSocket] = []
 
-# Initialize Kokoro for English
-print("[TTS] 🚀 Initializing Kokoro (English)...")
-kokoro_model = Kokoro(
-    model_path="/app/kokoro_models/kokoro-v0_19.onnx",
-    voices_path="/app/kokoro_models/voices.bin"
-)
-print("[TTS] ✅ Kokoro ready")
+# Initialize Coqui XTTS v2
+print("[TTS] 🚀 Initializing Coqui XTTS v2...")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"[TTS] 🖥️ Device: {device}")
 
-# Check Piper for Dutch
-print("[TTS] 🇳🇱 Checking Piper (Dutch)...")
 try:
-    result = subprocess.run([PIPER_PATH, "--version"], capture_output=True, text=True, timeout=5)
-    print(f"[TTS] ✅ Piper ready: {result.stdout.strip()}")
-except:
-    print(f"[TTS] ⚠️ Piper not available")
+    # Load the model pre-downloaded in the Dockerfile (or download it on first run)
+    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+    print("[TTS] ✅ XTTS v2 model loaded and ready")
+except Exception as e:
+    print(f"[TTS] ❌ FATAL: Could not load XTTS model: {e}")
+    tts = None
 
 def text_to_visemes_simple(text: str, duration: float):
-    """Generate visemes"""
+    """Generate simple visemes for lip sync (logic unchanged)"""
     visemes = []
     words = text.split()
     if not words:
@@ -69,85 +56,28 @@ def text_to_visemes_simple(text: str, duration: float):
 
     for word in words:
         word_lower = word.lower().strip('.,!?')
-
         for i, char in enumerate(word_lower):
             char_time = current_time + (i / len(word_lower)) * time_per_word
-
-            if char in 'aá':
-                visemes.append({"t": round(char_time, 3), "blend": {"jawOpen": 0.6}})
-            elif char in 'eé':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthSmile": 0.4, "jawOpen": 0.2}})
-            elif char in 'ií':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthSmile": 0.5, "jawOpen": 0.1}})
-            elif char in 'oó':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthFunnel": 0.6, "jawOpen": 0.3}})
-            elif char in 'uú':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthPucker": 0.5, "jawOpen": 0.1}})
-            elif char in 'fv':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthPressLeft": 0.5, "mouthPressRight": 0.5}})
-            elif char in 'mpb':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthPucker": 0.4}})
-            elif char in 'w':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthPucker": 0.5, "jawOpen": 0.2}})
-
+            blend = {}
+            if char in 'aá': blend = {"jawOpen": 0.6}
+            elif char in 'eé': blend = {"mouthSmile": 0.4, "jawOpen": 0.2}
+            elif char in 'ií': blend = {"mouthSmile": 0.5, "jawOpen": 0.1}
+            elif char in 'oó': blend = {"mouthFunnel": 0.6, "jawOpen": 0.3}
+            elif char in 'uú': blend = {"mouthPucker": 0.5, "jawOpen": 0.1}
+            elif char in 'fv': blend = {"mouthPressLeft": 0.5, "mouthPressRight": 0.5}
+            elif char in 'mpb': blend = {"mouthPucker": 0.4}
+            elif char in 'w': blend = {"mouthPucker": 0.5, "jawOpen": 0.2}
+
+            if blend:
+                visemes.append({"t": round(char_time, 3), "blend": blend})
         current_time += time_per_word
 
     return visemes
 
-def generate_kokoro_tts(text: str, wav_path: str, voice_name: str):
-    """Generate English audio using Kokoro"""
-    try:
-        print(f"[TTS] 🔧 Kokoro generating ({voice_name})...")
-        start = time.time()
-
-        audio, sample_rate = kokoro_model.create(text, voice=voice_name, speed=1.0)
-        audio_int16 = (audio * 32767).astype(np.int16)
-        wavfile.write(wav_path, sample_rate, audio_int16)
-
-        print(f"[TTS] ✅ Kokoro done in {time.time()-start:.2f}s")
-        return True
-
-    except Exception as e:
-        print(f"[TTS] ❌ Kokoro error: {e}")
-        return False
-
-def generate_piper_tts(text: str, wav_path: str, voice_name: str):
-    """Generate Dutch audio using Piper"""
-    try:
-        print(f"[TTS] 🔧 Piper generating ({voice_name})...")
-        start = time.time()
-
-        voices_dir = os.environ.get('PIPER_VOICES_DIR', '/opt/piper/voices')
-
-        process = subprocess.Popen(
-            [PIPER_PATH, "--model", voice_name, "--data-dir", voices_dir, "--output_file", wav_path],
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True
-        )
-
-        stdout, stderr = process.communicate(input=text, timeout=30)
-
-        if process.returncode != 0:
-            print(f"[TTS] ❌ Piper error: {stderr}")
-            return False
-
-        if os.path.exists(wav_path) and os.path.getsize(wav_path) > 0:
-            print(f"[TTS] ✅ Piper done in {time.time()-start:.2f}s")
-            return True
-
-        return False
-
-    except Exception as e:
-        print(f"[TTS] ❌ Piper error: {e}")
-        return False
-
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
     await websocket.accept()
     active_connections.append(websocket)
-    print(f"[WebSocket] ✅ Client connected. Total: {len(active_connections)}")
     try:
         while True:
             await websocket.receive_text()
@@ -172,38 +102,40 @@ async def speak(text: str = Form(...), voice: str = Form("female"), language: st
     wav_path = os.path.join(OUT_DIR, f"{uid}.wav")
     mp3_path = os.path.join(OUT_DIR, f"{uid}.mp3")
 
+    # Map 'female'/'male' to specific XTTS speaker names
+    speaker_name = VOICE_MAP.get(voice, voice)  # fall back to the input if it is already a speaker name
+
     print(f"\n{'='*60}")
-    print(f"[Backend] HYBRID TTS")
+    print(f"[Backend] XTTS v2 Generation")
     print(f"[Backend] Text: '{text}'")
-    print(f"[Backend] Language: {language}, Voice: {voice}")
+    print(f"[Backend] Lang: {language} | Speaker: {speaker_name}")
 
     try:
-        # Choose TTS engine based on language
-        if language == "nl":
-            # Use Piper for Dutch (better quality)
-            selected_voice = PIPER_VOICES.get(voice, PIPER_VOICES["female"])
-            print(f"[Backend] Using PIPER for Dutch: {selected_voice}")
-            success = generate_piper_tts(text, wav_path, selected_voice)
-        else:
-            # Use Kokoro for English (better quality)
-            selected_voice = KOKORO_VOICES.get(voice, KOKORO_VOICES["female"])
-            print(f"[Backend] Using KOKORO for English: {selected_voice}")
-            success = generate_kokoro_tts(text, wav_path, selected_voice)
-
-        if not success:
-            raise Exception("TTS generation failed")
+        if tts is None:
+            raise Exception("TTS model not initialized")
+
+        # Generate audio
+        # split_sentences=True generally handles long text better in XTTS
+        tts.tts_to_file(
+            text=text,
+            file_path=wav_path,
+            speaker=speaker_name,
+            language=language,
+            split_sentences=True
+        )
 
         t2 = time.time()
-        print(f"[Backend] Audio generated in {t2-t_start:.2f}s")
+        print(f"[Backend] ✅ Generated in {t2-t_start:.2f}s")
 
-        # Convert to MP3
+        # Convert to MP3 for the web (smaller files)
         try:
             audio = AudioSegment.from_wav(wav_path)
             audio.export(mp3_path, format="mp3", bitrate="128k")
             duration_sec = len(audio) / 1000.0
             os.remove(wav_path)
             audio_file = mp3_path
-        except:
+        except Exception as e:
+            print(f"[Backend] ⚠️ MP3 conversion failed, using WAV: {e}")
             with wave.open(wav_path, 'rb') as wf:
                 duration_sec = wf.getnframes() / float(wf.getframerate())
             audio_file = wav_path
@@ -221,7 +153,7 @@ async def speak(text: str = Form(...), voice: str = Form("female"), language: st
         return response_data
 
     except Exception as e:
-        error_msg = f"TTS failed: {str(e)}"
+        error_msg = f"XTTS failed: {str(e)}"
         print(f"[Backend] ❌ {error_msg}")
         return JSONResponse(status_code=500, content={"error": error_msg})
 
@@ -229,18 +161,12 @@ async def speak(text: str = Form(...), voice: str = Form("female"), language: st
 async def root():
     return {
         "status": "running",
-        "tts_engine": "hybrid",
-        "engines": {
-            "en": "kokoro-onnx",
-            "nl": "piper"
-        },
-        "voices": {
-            "en": KOKORO_VOICES,
-            "nl": PIPER_VOICES
-        }
+        "tts_engine": "coqui-xtts-v2",
+        "languages": ["en", "nl", "fr", "de", "it", "es", "ja", "zh", "pt", "pl", "tr", "ru", "cs", "ar", "hu", "ko"],
+        "voices": VOICE_MAP
     }
 
 if __name__ == "__main__":
     import uvicorn
-    print("🚀 Hybrid TTS Server (Kokoro EN + Piper NL)")
+    print("🚀 Avatar Server (Coqui XTTS v2)")
     uvicorn.run(app, host="0.0.0.0", port=8765)
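
For reference, text_to_visemes_simple uses a purely uniform timing model: each word receives duration/len(words) seconds, and each character within a word is offset by i/len(word) of that share. A standalone sketch of just the timing arithmetic (same math as above, plus a guard for punctuation-only words that the original omits):

def char_times(text: str, duration: float):
    """Return (char, timestamp) pairs using the uniform timing scheme above."""
    words = text.split()
    time_per_word = duration / len(words)
    t, out = 0.0, []
    for word in words:
        w = word.lower().strip('.,!?')
        if w:  # guard: the original assumes every word contains letters
            for i, ch in enumerate(w):
                out.append((ch, round(t + (i / len(w)) * time_per_word, 3)))
        t += time_per_word
    return out

print(char_times("Hello world", 1.0))
# [('h', 0.0), ('e', 0.1), ('l', 0.2), ('l', 0.3), ('o', 0.4),
#  ('w', 0.5), ('o', 0.6), ('r', 0.7), ('l', 0.8), ('d', 0.9)]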
mrrrme/audio/voice_assistant.py CHANGED
@@ -1,4 +1,4 @@
-"""Text-to-Speech using Kokoro-82M (PROPER INITIALIZATION)"""
+"""Text-to-Speech using Coqui XTTS v2 (Multi-lingual)"""
 import os
 import time
 import tempfile
@@ -6,28 +6,28 @@ import threading
 import pygame
 import torch
 import numpy as np
-import scipy.io.wavfile as wavfile
-from pydub import AudioSegment
 from dotenv import load_dotenv
-from kokoro_onnx import Kokoro
+from TTS.api import TTS  # Coqui TTS
 
 load_dotenv()
 
+# XTTS v2 default speakers
+# These are standard speakers bundled with the xtts_v2 model.
 VOICE_MAP = {
-    "female": "af_sarah",
-    "male": "am_adam",
-    "Happy": "af_sarah",
-    "Sad": "af_bella",
-    "Angry": "am_adam",
-    "Neutral": "af_sarah",
+    "female": "Ana Florence",
+    "male": "Andrew Chipper",
+    "Happy": "Ana Florence",
+    "Sad": "Ana Florence",
+    "Angry": "Andrew Chipper",
+    "Neutral": "Ana Florence",
 }
 
 class VoiceAssistant:
-    """Kokoro TTS"""
+    """Coqui XTTS v2 TTS"""
 
     def __init__(self, voice: str = "female", rate: float = 1.0, language: str = "en"):
         self.voice_key = voice
-        self.voice_name = VOICE_MAP.get(voice, "af_sarah")
+        self.voice_name = VOICE_MAP.get(voice, "Ana Florence")
        self.rate = rate
        self.language = language
 
@@ -36,18 +36,20 @@ class VoiceAssistant:
        self.speaking_lock = threading.Lock()
        self.audio_workers = []
 
-        print(f"[TTS] 🚀 Using Kokoro-82M TTS")
+        print(f"[TTS] 🚀 Initializing Coqui XTTS v2...")
 
-        # Initialize Kokoro with paths (models pre-downloaded in Dockerfile)
+        # Initialize Coqui TTS with the XTTS v2 model;
+        # .to(device) moves it onto CUDA when a GPU is available
        try:
-            self.kokoro = Kokoro(
-                model_path="/app/kokoro_models/kokoro-v0_19.onnx",
-                voices_path="/app/kokoro_models/voices.bin"
-            )
-            print(f"[TTS] ✅ Kokoro ready")
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            print(f"[TTS] 📥 Loading XTTS v2 model on {device} (this may take time on first run)...")
+
+            self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+
+            print(f"[TTS] ✅ XTTS v2 model loaded")
        except Exception as e:
-            print(f"[TTS] ⚠️ Kokoro init: {e}")
-            self.kokoro = None
+            print(f"[TTS] ⚠️ XTTS init error: {e}")
+            self.tts = None
 
        print("[TTS] 🔧 Initializing pygame...")
        try:
@@ -64,22 +66,35 @@ class VoiceAssistant:
            print(f"[TTS] ✅ Registered: {worker.__class__.__name__}")
 
    def set_voice(self, voice_key: str):
+        """Switch between male/female voices"""
        if voice_key in VOICE_MAP:
            self.voice_name = VOICE_MAP[voice_key]
            self.voice_key = voice_key
            print(f"[TTS] 🎙️ Voice → {self.voice_name}")
        else:
+            # Allow a raw XTTS speaker name to pass through
            self.voice_name = voice_key
+            print(f"[TTS] 🎙️ Voice → {self.voice_name} (custom)")
 
    def set_language(self, language: str):
+        """Set language (e.g., 'en', 'nl')"""
        self.language = language
        print(f"[TTS] 🌍 Language → {language}")
 
    def set_rate(self, rate: float):
+        """
+        XTTS v2 does not expose speed control through this API;
+        the rate is kept for compatibility but may not affect generation.
+        """
        self.rate = max(0.5, min(2.0, rate))
-        print(f"[TTS] 🎚️ Rate → {self.rate}x")
+        print(f"[TTS] 🎚️ Rate → {self.rate}x (XTTS may ignore this)")
 
    def apply_emotion_voice(self, emotion: str, intensity: float = 0.5):
+        """
+        Adjust internal state based on emotion.
+        XTTS conveys emotion through the input text itself (or style
+        transfer, if enabled); for now this only tweaks simple parameters.
+        """
        if emotion == "Happy":
            self.rate = 1.1
        elif emotion == "Sad":
@@ -109,26 +124,28 @@ class VoiceAssistant:
 
    def _get_unique_filename(self, ext: str = ".wav"):
        self.counter += 1
-        return os.path.join(tempfile.gettempdir(), f"kokoro_{self.counter}_{int(time.time() * 1000)}{ext}")
+        return os.path.join(tempfile.gettempdir(), f"xtts_{self.counter}_{int(time.time() * 1000)}{ext}")
 
    def _generate_speech(self, text: str, filename: str):
-        """Generate speech using Kokoro"""
+        """Generate speech using Coqui XTTS v2"""
        try:
-            # Initialize on first use if needed
-            if self.kokoro is None:
-                print("[TTS] 📥 Initializing Kokoro...")
-                self.kokoro = Kokoro(
-                    model_path="/app/kokoro_models/kokoro-v0_19.onnx",
-                    voices_path="/app/kokoro_models/voices.bin"
-                )
+            if self.tts is None:
+                print("[TTS] ❌ Model not initialized")
+                return False
 
-            print(f"[TTS] 🔧 Generating...")
+            print(f"[TTS] 🔧 Generating with {self.voice_name} ({self.language})...")
            start = time.time()
 
-            # create() returns (audio, sample_rate)
-            audio, sample_rate = self.kokoro.create(text, voice=self.voice_name, speed=self.rate)
-            audio_int16 = (audio * 32767).astype(np.int16)
-            wavfile.write(filename, sample_rate, audio_int16)
+            # XTTS v2 generation using the named speaker directly.
+            # Note: tts_to_file does not take a 'speed' argument here.
+            self.tts.tts_to_file(
+                text=text,
+                file_path=filename,
+                speaker=self.voice_name,
+                language=self.language,
+                split_sentences=True
+            )
 
            gen_time = time.time() - start
            print(f"[TTS] ✅ Generated in {gen_time:.2f}s")
@@ -166,6 +183,7 @@ class VoiceAssistant:
 
        print(f"\n[TTS] 🔊 Speaking ({self.language}): '{text[:80]}...'")
 
+        # Pause listening workers while audio plays
        for worker in self.audio_workers:
            if hasattr(worker, 'pause_listening'):
                try:
@@ -195,6 +213,7 @@ class VoiceAssistant:
 
        time.sleep(0.2)
 
+        # Resume listening workers
        for worker in self.audio_workers:
            if hasattr(worker, 'resume_listening'):
                try:
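
Since VOICE_MAP hardcodes the names "Ana Florence" and "Andrew Chipper", it is worth confirming the loaded checkpoint actually exposes them. A minimal sketch, assuming the TTS API object exposes a speakers list for multi-speaker models (attribute availability can vary across coqui-tts versions):

import torch
from TTS.api import TTS

device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# getattr keeps this safe if the installed version lacks the attribute
for name in (getattr(tts, "speakers", None) or []):
    print(name)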
requirements_docker.txt CHANGED
@@ -1,4 +1,4 @@
-# MrrrMe Backend Requirements - HYBRID TTS (Kokoro EN + Piper NL)
+# MrrrMe Backend Requirements - Coqui XTTS v2
 # Core frameworks
 fastapi==0.115.4
 uvicorn[standard]==0.32.0
@@ -14,7 +14,7 @@ timm==1.0.11
 einops==0.8.0
 
 # ML/DL
-# NOTE: torch==2.4.0, torchvision==0.19.0, torchaudio==2.4.0 are installed in Dockerfile
+# NOTE: torch, torchvision, torchaudio are installed in the Dockerfile with CUDA support
 numpy==1.26.4
 scipy==1.13.1
 pandas==2.2.3
@@ -31,10 +31,10 @@ pydub==0.25.1
 sounddevice==0.5.1
 webrtcvad==2.0.10
 
-# Text-to-Speech (Piper for Dutch - installed separately)
-# Kokoro for English - installed separately
-onnxruntime==1.16.3
-pygame==2.6.1
+# Text-to-Speech
+coqui-tts==0.24.2
+# Note: coqui-tts pulls in many dependencies and may conflict with the torch
+# preinstalled in the Dockerfile; if the build fails, drop the version pin here.
 
 # Text processing
 nltk==3.9.1
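
Given the conflict warning above, a quick post-install check (hypothetical, not part of this commit) can confirm that installing coqui-tts left the CUDA torch build from the Dockerfile intact:

import torch

# The Dockerfile installs torch 2.4.0 from the cu118 index before requirements
print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
assert torch.__version__.startswith("2.4.0"), "torch was replaced during pip install"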