MichonGoddijn231849 committed · 86bb0f2
1 parent: 387a013

coqui xtts instead of kokoro + piper

Files changed:
- Dockerfile (+23 -48)
- avatar/speak_server.py (+55 -129)
- mrrrme/audio/voice_assistant.py (+55 -36)
- requirements_docker.txt (+6 -6)
Dockerfile  CHANGED

@@ -1,7 +1,8 @@
-# Hugging Face Spaces - MrrrMe with
+# Hugging Face Spaces - MrrrMe with Coqui XTTS v2
 FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
 
 # Install system dependencies
+# espeak-ng is often required for Coqui TTS text processing
 RUN apt-get update && apt-get install -y \
     bash \
     git \
@@ -14,17 +15,12 @@ RUN apt-get update && apt-get install -y \
     python3.11-dev \
     libgl1-mesa-glx \
     libglib2.0-0 \
-    libsm6 \
-    libxext6 \
     ffmpeg \
     portaudio19-dev \
     libsndfile1 \
+    espeak-ng \
     nginx \
     gnupg \
-    htop \
-    vim \
-    nano \
-    unzip \
     && rm -rf /var/lib/apt/lists/*
 
 # Install Node.js 20
@@ -32,22 +28,6 @@ RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
     apt-get install -y nodejs && \
     rm -rf /var/lib/apt/lists/*
 
-# Install Piper TTS for Dutch
-RUN wget https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_amd64.tar.gz -O /tmp/piper.tar.gz && \
-    tar -xzf /tmp/piper.tar.gz -C /opt && \
-    ln -s /opt/piper/piper /usr/local/bin/piper && \
-    chmod +x /opt/piper/piper && \
-    rm /tmp/piper.tar.gz
-
-# Download Dutch Piper voices
-RUN mkdir -p /opt/piper/voices && cd /opt/piper/voices && \
-    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/nl/nl_NL/mls/medium/nl_NL-mls-medium.onnx && \
-    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/nl/nl_NL/mls/medium/nl_NL-mls-medium.onnx.json && \
-    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/nl/nl_BE/rdh/medium/nl_BE-rdh-medium.onnx && \
-    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/nl/nl_BE/rdh/medium/nl_BE-rdh-medium.onnx.json
-
-ENV PIPER_VOICES_DIR=/opt/piper/voices
-
 # Set Python 3.11 as default
 RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
     update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
@@ -61,33 +41,21 @@ RUN python3.11 -m pip install --no-cache-dir \
     torchaudio==2.4.0 \
     --index-url https://download.pytorch.org/whl/cu118
 
-#
-
-    echo "torchvision==0.19.0" >> /tmp/constraints.txt && \
-    echo "torchaudio==2.4.0" >> /tmp/constraints.txt && \
-    echo "httpx<0.28.0" >> /tmp/constraints.txt
+# Agree to Coqui Terms (Required for XTTS)
+ENV COQUI_TOS_AGREED=1
 
 # Install Python dependencies
 COPY requirements_docker.txt ./
-RUN
-
-#
-
-
-
-
-
-    import urllib.request; \
-    os.makedirs('/app/kokoro_models', exist_ok=True); \
-    print('Downloading Kokoro models...'); \
-    urllib.request.urlretrieve('https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/kokoro-v0_19.onnx', '/app/kokoro_models/kokoro-v0_19.onnx'); \
-    urllib.request.urlretrieve('https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/voices.bin', '/app/kokoro_models/voices.bin'); \
-    print('Kokoro ready'); \
-    "
-
-# Install avatar dependencies
+RUN python3.11 -m pip install --no-cache-dir -r requirements_docker.txt
+
+# PRE-DOWNLOAD XTTS V2 MODEL
+# This prevents downloading it at runtime. We run a small script to trigger the download.
+# The model will be stored in /root/.local/share/tts/ by default.
+RUN python3.11 -c "from TTS.api import TTS; print('⏳ Downloading XTTS v2 model...'); TTS('tts_models/multilingual/multi-dataset/xtts_v2'); print('✅ Download complete.')"
+
+# Install avatar dependencies (FastAPI, etc)
 RUN python3.11 -m pip install --no-cache-dir \
-    fastapi uvicorn python-multipart pydub websockets onnxruntime
+    fastapi uvicorn python-multipart pydub websockets onnxruntime
 
 # Copy application code
 COPY --link --chown=1000:1000 mrrrme/ ./mrrrme/
@@ -98,7 +66,7 @@ COPY --link --chown=1000:1000 weights/ ./weights/
 # Create directories
 RUN mkdir -p /app/weights /app/avatar/static
 
-# Fix openface bug
+# Fix openface bug (legacy)
 RUN python3.11 -c "import os; fp='/usr/local/lib/python3.11/dist-packages/openface/multitask_model.py'; c=open(fp).read() if os.path.exists(fp) else ''; exec(\"if os.path.exists(fp) and 'import cv2' not in c:\\n open(fp,'w').write('import cv2\\\\n'+c)\\n print('Patched')\")"
 
 # Build frontend
@@ -127,12 +95,19 @@ RUN mkdir -p /etc/nginx/certs && \
     -subj "/CN=mrrrme.hf.space"
 
 # Create startup script
-RUN printf '#!/bin/bash\nset -e\nexport HOME=/
+RUN printf '#!/bin/bash\nset -e\nexport HOME=/root\nmkdir -p /tmp\n\n# Agree to TOS at runtime as well just in case\nexport COQUI_TOS_AGREED=1\n\nif [ -d "/data" ] && [ -w "/data" ]; then\n echo "Persistent storage: /data"\n chmod 777 /data 2>/dev/null || true\nelse\n echo "Ephemeral storage: /tmp"\nfi\n\npkill -f "mrrrme.backend_server" 2>/dev/null || true\npkill -f "speak_server.py" 2>/dev/null || true\npkill -f "node server.js" 2>/dev/null || true\npkill -f "nginx" 2>/dev/null || true\n\nsleep 2\necho "Starting MrrrMe (XTTS v2 Enabled)..."\ncd /app && python3.11 -m mrrrme.backend_server &\ncd /app/avatar && python3.11 speak_server.py &\ncd /app/frontend/.next/standalone && HOSTNAME=0.0.0.0 PORT=3001 node server.js &\nsleep 10\nnginx -g "daemon off;" &\necho "Ready!"\nwait\n' > /app/start.sh && chmod +x /app/start.sh
 
 # Set ownership
 RUN chown -R 1000:1000 /app
 
+# Note: Coqui TTS downloads models to user home.
+# Since we downloaded as root in build, we need to ensure permissions or run as root.
+# For simplicity in Spaces, we might run as root or ensure HOME env var is correct.
+# Here we stick to user 1000 but we must copy the models.
+RUN cp -r /root/.local /home/user/.local || true && chown -R 1000:1000 /home/user
+
 USER 1000
+ENV HOME=/home/user
 
 EXPOSE 7860
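The model pre-download plus the HOME/permissions shuffle above is the fragile part of this change: the XTTS cache is written under /root at build time but must be readable by UID 1000 at runtime. A minimal smoke check along these lines can verify the bake worked before deploying (a sketch: the cache path `~/.local/share/tts` is Coqui's documented default, and the folder-name check is an assumption about how Coqui flattens model ids):

```python
# smoke_check.py - hypothetical sanity check for the baked-in XTTS cache.
# Run inside the built image as the runtime user, e.g.:
#   docker run --rm -u 1000 <image> python3.11 smoke_check.py
import os

cache = os.path.expanduser("~/.local/share/tts")  # Coqui's default model cache
entries = os.listdir(cache) if os.path.isdir(cache) else []
print(f"HOME={os.environ.get('HOME')}")
print(f"cache={cache} entries={entries}")
# Coqui typically stores models in folders named like
# tts_models--multilingual--multi-dataset--xtts_v2 (assumption)
assert any("xtts_v2" in e for e in entries), "XTTS v2 model not baked into the image"
```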
avatar/speak_server.py  CHANGED

@@ -1,10 +1,8 @@
-"""Avatar Backend -
+"""Avatar Backend - Coqui XTTS v2 (Multi-lingual)"""
 import os
-import json
 import uuid
 import time
 import wave
-import subprocess
 from fastapi import FastAPI, Form, WebSocket
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
@@ -13,52 +11,41 @@ from pydub import AudioSegment
 from typing import List
 from dotenv import load_dotenv
 import torch
-
-import numpy as np
-from kokoro_onnx import Kokoro
+from TTS.api import TTS
 
 load_dotenv()
 
 OUT_DIR = "/tmp/avatar_static"
 os.makedirs(OUT_DIR, exist_ok=True)
 
-#
-
-
-    "
+# XTTS v2 Standard Speakers
+# These speakers support both English (en) and Dutch (nl) natively
+VOICE_MAP = {
+    "female": "Ana Florence",
+    "male": "Andrew Chipper"
 }
 
-PIPER_VOICES = {
-    "female": "nl_NL-mls-medium",
-    "male": "nl_BE-rdh-medium"
-}
-
-PIPER_PATH = "piper"
-
 app = FastAPI()
 app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 app.mount("/static", StaticFiles(directory=OUT_DIR), name="static")
 
 active_connections: List[WebSocket] = []
 
-# Initialize
-print("[TTS] 🚀 Initializing
-
-
-    voices_path="/app/kokoro_models/voices.bin"
-)
-print("[TTS] ✅ Kokoro ready")
+# Initialize Coqui XTTS v2
+print("[TTS] 🚀 Initializing Coqui XTTS v2...")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"[TTS] 🖥️ Device: {device}")
 
-# Check Piper for Dutch
-print("[TTS] 🇳🇱 Checking Piper (Dutch)...")
 try:
-
-
-
-
+    # This will load the model from the directory defined in Dockerfile or download it
+    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+    print("[TTS] ✅ XTTS v2 model loaded and ready")
+except Exception as e:
+    print(f"[TTS] ❌ FATAL: Could not load XTTS model: {e}")
+    tts = None
 
 def text_to_visemes_simple(text: str, duration: float):
-    """Generate visemes"""
+    """Generate simple visemes for lip sync (unchanged)"""
     visemes = []
     words = text.split()
     if not words:
@@ -69,85 +56,28 @@ def text_to_visemes_simple(text: str, duration: float):
 
     for word in words:
         word_lower = word.lower().strip('.,!?')
-
         for i, char in enumerate(word_lower):
             char_time = current_time + (i / len(word_lower)) * time_per_word
+            blend = {}
+            if char in 'aá': blend = {"jawOpen": 0.6}
+            elif char in 'eé': blend = {"mouthSmile": 0.4, "jawOpen": 0.2}
+            elif char in 'ií': blend = {"mouthSmile": 0.5, "jawOpen": 0.1}
+            elif char in 'oó': blend = {"mouthFunnel": 0.6, "jawOpen": 0.3}
+            elif char in 'uú': blend = {"mouthPucker": 0.5, "jawOpen": 0.1}
+            elif char in 'fv': blend = {"mouthPressLeft": 0.5, "mouthPressRight": 0.5}
+            elif char in 'mpb': blend = {"mouthPucker": 0.4}
+            elif char in 'w': blend = {"mouthPucker": 0.5, "jawOpen": 0.2}
 
-            if
-                visemes.append({"t": round(char_time, 3), "blend":
-            elif char in 'eé':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthSmile": 0.4, "jawOpen": 0.2}})
-            elif char in 'ií':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthSmile": 0.5, "jawOpen": 0.1}})
-            elif char in 'oó':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthFunnel": 0.6, "jawOpen": 0.3}})
-            elif char in 'uú':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthPucker": 0.5, "jawOpen": 0.1}})
-            elif char in 'fv':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthPressLeft": 0.5, "mouthPressRight": 0.5}})
-            elif char in 'mpb':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthPucker": 0.4}})
-            elif char in 'w':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthPucker": 0.5, "jawOpen": 0.2}})
-
+            if blend:
+                visemes.append({"t": round(char_time, 3), "blend": blend})
         current_time += time_per_word
 
     return visemes
 
-def generate_kokoro_tts(text: str, wav_path: str, voice_name: str):
-    """Generate English audio using Kokoro"""
-    try:
-        print(f"[TTS] 🎧 Kokoro generating ({voice_name})...")
-        start = time.time()
-
-        audio, sample_rate = kokoro_model.create(text, voice=voice_name, speed=1.0)
-        audio_int16 = (audio * 32767).astype(np.int16)
-        wavfile.write(wav_path, sample_rate, audio_int16)
-
-        print(f"[TTS] ✅ Kokoro done in {time.time()-start:.2f}s")
-        return True
-
-    except Exception as e:
-        print(f"[TTS] ❌ Kokoro error: {e}")
-        return False
-
-def generate_piper_tts(text: str, wav_path: str, voice_name: str):
-    """Generate Dutch audio using Piper"""
-    try:
-        print(f"[TTS] 🎧 Piper generating ({voice_name})...")
-        start = time.time()
-
-        voices_dir = os.environ.get('PIPER_VOICES_DIR', '/opt/piper/voices')
-
-        process = subprocess.Popen(
-            [PIPER_PATH, "--model", voice_name, "--data-dir", voices_dir, "--output_file", wav_path],
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True
-        )
-
-        stdout, stderr = process.communicate(input=text, timeout=30)
-
-        if process.returncode != 0:
-            print(f"[TTS] ❌ Piper error: {stderr}")
-            return False
-
-        if os.path.exists(wav_path) and os.path.getsize(wav_path) > 0:
-            print(f"[TTS] ✅ Piper done in {time.time()-start:.2f}s")
-            return True
-
-        return False
-
-    except Exception as e:
-        print(f"[TTS] ❌ Piper error: {e}")
-        return False
-
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
     await websocket.accept()
     active_connections.append(websocket)
-    print(f"[WebSocket] ✅ Client connected. Total: {len(active_connections)}")
     try:
         while True:
             await websocket.receive_text()
@@ -172,38 +102,40 @@ async def speak(text: str = Form(...), voice: str = Form("female"), language: st
     wav_path = os.path.join(OUT_DIR, f"{uid}.wav")
     mp3_path = os.path.join(OUT_DIR, f"{uid}.mp3")
 
+    # Map 'female'/'male' to specific XTTS speaker names
+    speaker_name = VOICE_MAP.get(voice, voice)  # Fallback to input if it's already a name
+
     print(f"\n{'='*60}")
-    print(f"[Backend]
+    print(f"[Backend] XTTS v2 Generation")
     print(f"[Backend] Text: '{text}'")
-    print(f"[Backend]
+    print(f"[Backend] Lang: {language} | Speaker: {speaker_name}")
 
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-        if not success:
-            raise Exception("TTS generation failed")
+        if tts is None:
+            raise Exception("TTS Model not initialized")
+
+        # Generate Audio
+        # split_sentences=True is generally better for long text in XTTS
+        tts.tts_to_file(
+            text=text,
+            file_path=wav_path,
+            speaker=speaker_name,
+            language=language,
+            split_sentences=True
+        )
 
         t2 = time.time()
-        print(f"[Backend]
+        print(f"[Backend] ✅ Generated in {t2-t_start:.2f}s")
 
-        # Convert to MP3
+        # Convert to MP3 for web (smaller size)
         try:
            audio = AudioSegment.from_wav(wav_path)
            audio.export(mp3_path, format="mp3", bitrate="128k")
            duration_sec = len(audio) / 1000.0
            os.remove(wav_path)
            audio_file = mp3_path
-        except:
+        except Exception as e:
+            print(f"[Backend] ⚠️ MP3 conversion failed, using WAV: {e}")
            with wave.open(wav_path, 'rb') as wf:
                duration_sec = wf.getnframes() / float(wf.getframerate())
            audio_file = wav_path
@@ -221,7 +153,7 @@ async def speak(text: str = Form(...), voice: str = Form("female"), language: st
         return response_data
 
     except Exception as e:
-        error_msg = f"
+        error_msg = f"XTTS failed: {str(e)}"
         print(f"[Backend] ❌ {error_msg}")
         return JSONResponse(status_code=500, content={"error": error_msg})
 
@@ -229,18 +161,12 @@ async def speak(text: str = Form(...), voice: str = Form("female"), language: st
 async def root():
     return {
         "status": "running",
-        "tts_engine": "
-        "
-
-            "nl": "piper"
-        },
-        "voices": {
-            "en": KOKORO_VOICES,
-            "nl": PIPER_VOICES
-        }
+        "tts_engine": "coqui-xtts-v2",
+        "languages": ["en", "nl", "fr", "de", "it", "es", "ja", "zh", "pt", "pl", "tr", "ru", "cs", "ar", "hu", "ko"],
+        "voices": VOICE_MAP
     }
 
 if __name__ == "__main__":
     import uvicorn
-    print("🚀
+    print("🚀 Avatar Server (Coqui XTTS v2)")
     uvicorn.run(app, host="0.0.0.0", port=8765)
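Since the form fields (`text`, `voice`, `language`) are unchanged, the new engine can be exercised with a small client like the one below (a sketch: it assumes the route is `/speak` on port 8765, as suggested by the `uvicorn.run` call and the function name, and that `requests` is available):

```python
# speak_client.py - hypothetical smoke test for the XTTS-backed endpoint.
import requests

resp = requests.post(
    "http://localhost:8765/speak",  # route name is an assumption
    data={"text": "Hallo, dit is een test.", "voice": "female", "language": "nl"},
    timeout=120,  # the first XTTS generation after startup can be slow
)
resp.raise_for_status()
print(resp.json())  # response payload (audio URL, duration, visemes, ...)
```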
mrrrme/audio/voice_assistant.py  CHANGED

@@ -1,4 +1,4 @@
-"""Text-to-Speech using
+"""Text-to-Speech using Coqui XTTS v2 (Multi-lingual)"""
 import os
 import time
 import tempfile
@@ -6,28 +6,28 @@ import threading
 import pygame
 import torch
 import numpy as np
-import scipy.io.wavfile as wavfile
-from pydub import AudioSegment
 from dotenv import load_dotenv
-from
+from TTS.api import TTS  # Coqui TTS
 
 load_dotenv()
 
+# XTTS v2 Default Speakers
+# These are standard speakers included in the xtts_v2 model.
 VOICE_MAP = {
-    "female": "
-    "male": "
-    "Happy": "
-    "Sad": "
-    "Angry": "
-    "Neutral": "
+    "female": "Ana Florence",
+    "male": "Andrew Chipper",
+    "Happy": "Ana Florence",
+    "Sad": "Ana Florence",
+    "Angry": "Andrew Chipper",
+    "Neutral": "Ana Florence",
 }
 
 class VoiceAssistant:
-    """
+    """Coqui XTTS v2 TTS"""
 
     def __init__(self, voice: str = "female", rate: float = 1.0, language: str = "en"):
         self.voice_key = voice
-        self.voice_name = VOICE_MAP.get(voice, "
+        self.voice_name = VOICE_MAP.get(voice, "Ana Florence")
         self.rate = rate
         self.language = language
 
@@ -36,18 +36,20 @@ class VoiceAssistant:
         self.speaking_lock = threading.Lock()
         self.audio_workers = []
 
-        print(f"[TTS] 🚀
+        print(f"[TTS] 🚀 Initializing Coqui XTTS v2...")
 
-        # Initialize
+        # Initialize Coqui TTS with XTTS v2 model
+        # gpu=True will use CUDA if available
         try:
-
-
-
-            )
-
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            print(f"[TTS] 🔥 Loading XTTS v2 model on {device} (this may take time on first run)...")
+
+            self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+
+            print(f"[TTS] ✅ XTTS v2 model loaded")
         except Exception as e:
-            print(f"[TTS] ⚠️
-            self.
+            print(f"[TTS] ⚠️ XTTS init error: {e}")
+            self.tts = None
 
         print("[TTS] 🎧 Initializing pygame...")
         try:
@@ -64,22 +66,35 @@ class VoiceAssistant:
             print(f"[TTS] ✅ Registered: {worker.__class__.__name__}")
 
     def set_voice(self, voice_key: str):
+        """Switch between male/female voices"""
         if voice_key in VOICE_MAP:
             self.voice_name = VOICE_MAP[voice_key]
             self.voice_key = voice_key
             print(f"[TTS] 🎙️ Voice → {self.voice_name}")
         else:
+            # If user passes a raw speaker name that exists in XTTS
             self.voice_name = voice_key
+            print(f"[TTS] 🎙️ Voice → {self.voice_name} (Custom)")
 
     def set_language(self, language: str):
+        """Set language (e.g., 'en', 'nl')"""
         self.language = language
         print(f"[TTS] 🌍 Language → {language}")
 
     def set_rate(self, rate: float):
+        """
+        Note: XTTS v2 does not natively support speed control via API in the same way.
+        This is kept for compatibility but might not affect generation speed directly.
+        """
         self.rate = max(0.5, min(2.0, rate))
-        print(f"[TTS] 🎚️ Rate → {self.rate}x")
+        print(f"[TTS] 🎚️ Rate → {self.rate}x (XTTS may ignore this)")
 
     def apply_emotion_voice(self, emotion: str, intensity: float = 0.5):
+        """
+        Adjusts internal state based on emotion.
+        Note: XTTS implies emotion via the input text or style transfer (if enabled).
+        For now, we just log it or adjust simple parameters.
+        """
         if emotion == "Happy":
             self.rate = 1.1
         elif emotion == "Sad":
@@ -109,26 +124,28 @@ class VoiceAssistant:
 
     def _get_unique_filename(self, ext: str = ".wav"):
         self.counter += 1
-        return os.path.join(tempfile.gettempdir(), f"
+        return os.path.join(tempfile.gettempdir(), f"xtts_{self.counter}_{int(time.time() * 1000)}{ext}")
 
     def _generate_speech(self, text: str, filename: str):
-        """Generate speech using
+        """Generate speech using Coqui XTTS v2"""
         try:
-
-
-
-            self.kokoro = Kokoro(
-                model_path="/app/kokoro_models/kokoro-v0_19.onnx",
-                voices_path="/app/kokoro_models/voices.bin"
-            )
+            if self.tts is None:
+                print("[TTS] ❌ Model not initialized")
+                return False
 
-            print(f"[TTS] 🎧 Generating...")
+            print(f"[TTS] 🎧 Generating with {self.voice_name} ({self.language})...")
             start = time.time()
 
-            #
-
-
-
+            # XTTS v2 Generation
+            # We use the named speaker directly.
+            # Note: XTTS usually does not support 'speed' arg in tts_to_file directly.
+            self.tts.tts_to_file(
+                text=text,
+                file_path=filename,
+                speaker=self.voice_name,
+                language=self.language,
+                split_sentences=True
+            )
 
             gen_time = time.time() - start
             print(f"[TTS] ✅ Generated in {gen_time:.2f}s")
@@ -166,6 +183,7 @@ class VoiceAssistant:
 
         print(f"\n[TTS] 🔊 Speaking ({self.language}): '{text[:80]}...'")
 
+        # Pause workers (listening)
         for worker in self.audio_workers:
             if hasattr(worker, 'pause_listening'):
                 try:
@@ -195,6 +213,7 @@ class VoiceAssistant:
 
         time.sleep(0.2)
 
+        # Resume workers
        for worker in self.audio_workers:
            if hasattr(worker, 'resume_listening'):
                try:
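For completeness, a driver sketch for the rewritten class (assumptions: the import path matches the file location, the XTTS model is already cached, and the public speak() method from the unchanged part of the file still exists):

```python
# Hypothetical usage of the XTTS-backed VoiceAssistant (a sketch).
from mrrrme.audio.voice_assistant import VoiceAssistant

va = VoiceAssistant(voice="female", rate=1.0, language="en")
va.set_language("nl")   # XTTS v2 handles Dutch natively, no second engine needed
va.set_voice("male")    # maps to "Andrew Chipper" via VOICE_MAP
va.speak("Hallo! Dit is de nieuwe Coqui-stem.")  # speak() assumed from unchanged code
```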
requirements_docker.txt  CHANGED

@@ -1,4 +1,4 @@
-# MrrrMe Backend Requirements -
+# MrrrMe Backend Requirements - Coqui XTTS v2
 # Core frameworks
 fastapi==0.115.4
 uvicorn[standard]==0.32.0
@@ -14,7 +14,7 @@ timm==1.0.11
 einops==0.8.0
 
 # ML/DL
-# NOTE: torch
+# NOTE: torch, torchvision, torchaudio are installed in Dockerfile with CUDA support
 numpy==1.26.4
 scipy==1.13.1
 pandas==2.2.3
@@ -31,10 +31,10 @@ pydub==0.25.1
 sounddevice==0.5.1
 webrtcvad==2.0.10
 
-# Text-to-Speech
-
-
-
+# Text-to-Speech
+coqui-tts==0.24.2
+# Note: coqui-tts pulls in a lot of deps, but we constrain torch in Dockerfile.
+# If build fails on conflicts, remove the version constraint here.
 
 # Text processing
 nltk==3.9.1
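Given the note about dependency conflicts, a quick post-install check can confirm the pins that matter actually resolved (a sketch, standard library only; the torch pin 2.4.0 is inferred from the torchaudio/torchvision pins in the Dockerfile):

```python
# check_pins.py - hypothetical post-install verification (a sketch).
from importlib.metadata import version, PackageNotFoundError

for pkg, want in [("coqui-tts", "0.24.2"), ("torch", "2.4.0"), ("numpy", "1.26.4")]:
    try:
        got = version(pkg)
        status = "OK" if got.startswith(want) else f"expected {want}"
        print(f"{pkg}: {got} ({status})")
    except PackageNotFoundError:
        print(f"{pkg}: MISSING")
```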