MichonGoddijn231849 committed · 86bb0f2
1 parent: 387a013

coqui xtts instead of kokoro + piper

Files changed:
- Dockerfile (+23 -48)
- avatar/speak_server.py (+55 -129)
- mrrrme/audio/voice_assistant.py (+55 -36)
- requirements_docker.txt (+6 -6)
Dockerfile  CHANGED

@@ -1,7 +1,8 @@
-# Hugging Face Spaces - MrrrMe with
+# Hugging Face Spaces - MrrrMe with Coqui XTTS v2
 FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
 
 # Install system dependencies
+# espeak-ng is often required for Coqui TTS text processing
 RUN apt-get update && apt-get install -y \
     bash \
     git \
@@ -14,17 +15,12 @@ RUN apt-get update && apt-get install -y \
     python3.11-dev \
     libgl1-mesa-glx \
     libglib2.0-0 \
-    libsm6 \
-    libxext6 \
     ffmpeg \
     portaudio19-dev \
     libsndfile1 \
+    espeak-ng \
     nginx \
     gnupg \
-    htop \
-    vim \
-    nano \
-    unzip \
     && rm -rf /var/lib/apt/lists/*
 
 # Install Node.js 20
@@ -32,22 +28,6 @@ RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
     apt-get install -y nodejs && \
     rm -rf /var/lib/apt/lists/*
 
-# Install Piper TTS for Dutch
-RUN wget https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_amd64.tar.gz -O /tmp/piper.tar.gz && \
-    tar -xzf /tmp/piper.tar.gz -C /opt && \
-    ln -s /opt/piper/piper /usr/local/bin/piper && \
-    chmod +x /opt/piper/piper && \
-    rm /tmp/piper.tar.gz
-
-# Download Dutch Piper voices
-RUN mkdir -p /opt/piper/voices && cd /opt/piper/voices && \
-    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/nl/nl_NL/mls/medium/nl_NL-mls-medium.onnx && \
-    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/nl/nl_NL/mls/medium/nl_NL-mls-medium.onnx.json && \
-    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/nl/nl_BE/rdh/medium/nl_BE-rdh-medium.onnx && \
-    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/nl/nl_BE/rdh/medium/nl_BE-rdh-medium.onnx.json
-
-ENV PIPER_VOICES_DIR=/opt/piper/voices
-
 # Set Python 3.11 as default
 RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
     update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
@@ -61,33 +41,21 @@ RUN python3.11 -m pip install --no-cache-dir \
     torchaudio==2.4.0 \
     --index-url https://download.pytorch.org/whl/cu118
 
-#
-
-    echo "torchvision==0.19.0" >> /tmp/constraints.txt && \
-    echo "torchaudio==2.4.0" >> /tmp/constraints.txt && \
-    echo "httpx<0.28.0" >> /tmp/constraints.txt
+# Agree to Coqui Terms (Required for XTTS)
+ENV COQUI_TOS_AGREED=1
 
 # Install Python dependencies
 COPY requirements_docker.txt ./
-RUN
-
-#
-
-
-
-
-
-    import urllib.request; \
-    os.makedirs('/app/kokoro_models', exist_ok=True); \
-    print('Downloading Kokoro models...'); \
-    urllib.request.urlretrieve('https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/kokoro-v0_19.onnx', '/app/kokoro_models/kokoro-v0_19.onnx'); \
-    urllib.request.urlretrieve('https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/voices.bin', '/app/kokoro_models/voices.bin'); \
-    print('Kokoro ready'); \
-    "
-
-# Install avatar dependencies
+RUN python3.11 -m pip install --no-cache-dir -r requirements_docker.txt
+
+# PRE-DOWNLOAD XTTS V2 MODEL
+# This prevents downloading it at runtime. We run a small script to trigger the download.
+# The model will be stored in /root/.local/share/tts/ by default.
+RUN python3.11 -c "from TTS.api import TTS; print('⏳ Downloading XTTS v2 model...'); TTS('tts_models/multilingual/multi-dataset/xtts_v2'); print('✅ Download complete.')"
+
+# Install avatar dependencies (FastAPI, etc)
 RUN python3.11 -m pip install --no-cache-dir \
-    fastapi uvicorn python-multipart pydub websockets onnxruntime
+    fastapi uvicorn python-multipart pydub websockets onnxruntime
 
 # Copy application code
 COPY --link --chown=1000:1000 mrrrme/ ./mrrrme/
@@ -98,7 +66,7 @@ COPY --link --chown=1000:1000 weights/ ./weights/
 # Create directories
 RUN mkdir -p /app/weights /app/avatar/static
 
-# Fix openface bug
+# Fix openface bug (legacy)
 RUN python3.11 -c "import os; fp='/usr/local/lib/python3.11/dist-packages/openface/multitask_model.py'; c=open(fp).read() if os.path.exists(fp) else ''; exec(\"if os.path.exists(fp) and 'import cv2' not in c:\\n open(fp,'w').write('import cv2\\\\n'+c)\\n print('Patched')\")"
 
 # Build frontend
@@ -127,12 +95,19 @@ RUN mkdir -p /etc/nginx/certs && \
     -subj "/CN=mrrrme.hf.space"
 
 # Create startup script
-RUN printf '#!/bin/bash\nset -e\nexport HOME=/
+RUN printf '#!/bin/bash\nset -e\nexport HOME=/root\nmkdir -p /tmp\n\n# Agree to TOS at runtime as well just in case\nexport COQUI_TOS_AGREED=1\n\nif [ -d "/data" ] && [ -w "/data" ]; then\n echo "Persistent storage: /data"\n chmod 777 /data 2>/dev/null || true\nelse\n echo "Ephemeral storage: /tmp"\nfi\n\npkill -f "mrrrme.backend_server" 2>/dev/null || true\npkill -f "speak_server.py" 2>/dev/null || true\npkill -f "node server.js" 2>/dev/null || true\npkill -f "nginx" 2>/dev/null || true\n\nsleep 2\necho "Starting MrrrMe (XTTS v2 Enabled)..."\ncd /app && python3.11 -m mrrrme.backend_server &\ncd /app/avatar && python3.11 speak_server.py &\ncd /app/frontend/.next/standalone && HOSTNAME=0.0.0.0 PORT=3001 node server.js &\nsleep 10\nnginx -g "daemon off;" &\necho "Ready!"\nwait\n' > /app/start.sh && chmod +x /app/start.sh
 
 # Set ownership
 RUN chown -R 1000:1000 /app
 
+# Note: Coqui TTS downloads models to user home.
+# Since we downloaded as root in build, we need to ensure permissions or run as root.
+# For simplicity in Spaces, we might run as root or ensure HOME env var is correct.
+# Here we stick to user 1000 but we must copy the models.
+RUN cp -r /root/.local /home/user/.local || true && chown -R 1000:1000 /home/user
+
 USER 1000
+ENV HOME=/home/user
 
 EXPOSE 7860
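The model pre-download plus the HOME/permissions shuffle above is the fragile part of this change: the XTTS cache is written under /root at build time but must be readable by UID 1000 at runtime. A minimal smoke check along these lines can verify the bake worked before deploying (a sketch: the cache path `~/.local/share/tts` is Coqui's documented default, and the folder-name check is an assumption about how Coqui flattens model ids):

```python
# smoke_check.py - hypothetical sanity check for the baked-in XTTS cache.
# Run inside the built image as the runtime user, e.g.:
#   docker run --rm -u 1000 <image> python3.11 smoke_check.py
import os

cache = os.path.expanduser("~/.local/share/tts")  # Coqui's default model cache
entries = os.listdir(cache) if os.path.isdir(cache) else []
print(f"HOME={os.environ.get('HOME')}")
print(f"cache={cache} entries={entries}")
# Coqui typically stores models in folders named like
# tts_models--multilingual--multi-dataset--xtts_v2 (assumption)
assert any("xtts_v2" in e for e in entries), "XTTS v2 model not baked into the image"
```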
avatar/speak_server.py  CHANGED

@@ -1,10 +1,8 @@
-"""Avatar Backend -
+"""Avatar Backend - Coqui XTTS v2 (Multi-lingual)"""
 import os
-import json
 import uuid
 import time
 import wave
-import subprocess
 from fastapi import FastAPI, Form, WebSocket
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
@@ -13,52 +11,41 @@ from pydub import AudioSegment
 from typing import List
 from dotenv import load_dotenv
 import torch
-
-import numpy as np
-from kokoro_onnx import Kokoro
+from TTS.api import TTS
 
 load_dotenv()
 
 OUT_DIR = "/tmp/avatar_static"
 os.makedirs(OUT_DIR, exist_ok=True)
 
-#
-
-
-    "
+# XTTS v2 Standard Speakers
+# These speakers support both English (en) and Dutch (nl) natively
+VOICE_MAP = {
+    "female": "Ana Florence",
+    "male": "Andrew Chipper"
 }
 
-PIPER_VOICES = {
-    "female": "nl_NL-mls-medium",
-    "male": "nl_BE-rdh-medium"
-}
-
-PIPER_PATH = "piper"
-
 app = FastAPI()
 app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 app.mount("/static", StaticFiles(directory=OUT_DIR), name="static")
 
 active_connections: List[WebSocket] = []
 
-# Initialize
-print("[TTS] 🚀 Initializing
-
-
-    voices_path="/app/kokoro_models/voices.bin"
-)
-print("[TTS] ✅ Kokoro ready")
+# Initialize Coqui XTTS v2
+print("[TTS] 🚀 Initializing Coqui XTTS v2...")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"[TTS] 🖥️ Device: {device}")
 
-# Check Piper for Dutch
-print("[TTS] 🇳🇱 Checking Piper (Dutch)...")
 try:
-
-
-
-
+    # This will load the model from the directory defined in Dockerfile or download it
+    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+    print("[TTS] ✅ XTTS v2 model loaded and ready")
+except Exception as e:
+    print(f"[TTS] ❌ FATAL: Could not load XTTS model: {e}")
+    tts = None
 
 def text_to_visemes_simple(text: str, duration: float):
-    """Generate visemes"""
+    """Generate simple visemes for lip sync (unchanged)"""
     visemes = []
     words = text.split()
     if not words:
@@ -69,85 +56,28 @@ def text_to_visemes_simple(text: str, duration: float):
 
     for word in words:
         word_lower = word.lower().strip('.,!?')
-
         for i, char in enumerate(word_lower):
             char_time = current_time + (i / len(word_lower)) * time_per_word
+            blend = {}
+            if char in 'aá': blend = {"jawOpen": 0.6}
+            elif char in 'eé': blend = {"mouthSmile": 0.4, "jawOpen": 0.2}
+            elif char in 'ií': blend = {"mouthSmile": 0.5, "jawOpen": 0.1}
+            elif char in 'oó': blend = {"mouthFunnel": 0.6, "jawOpen": 0.3}
+            elif char in 'uú': blend = {"mouthPucker": 0.5, "jawOpen": 0.1}
+            elif char in 'fv': blend = {"mouthPressLeft": 0.5, "mouthPressRight": 0.5}
+            elif char in 'mpb': blend = {"mouthPucker": 0.4}
+            elif char in 'w': blend = {"mouthPucker": 0.5, "jawOpen": 0.2}
 
-            if
-                visemes.append({"t": round(char_time, 3), "blend":
-            elif char in 'eé':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthSmile": 0.4, "jawOpen": 0.2}})
-            elif char in 'ií':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthSmile": 0.5, "jawOpen": 0.1}})
-            elif char in 'oó':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthFunnel": 0.6, "jawOpen": 0.3}})
-            elif char in 'uú':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthPucker": 0.5, "jawOpen": 0.1}})
-            elif char in 'fv':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthPressLeft": 0.5, "mouthPressRight": 0.5}})
-            elif char in 'mpb':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthPucker": 0.4}})
-            elif char in 'w':
-                visemes.append({"t": round(char_time, 3), "blend": {"mouthPucker": 0.5, "jawOpen": 0.2}})
-
+            if blend:
+                visemes.append({"t": round(char_time, 3), "blend": blend})
         current_time += time_per_word
 
     return visemes
 
-def generate_kokoro_tts(text: str, wav_path: str, voice_name: str):
-    """Generate English audio using Kokoro"""
-    try:
-        print(f"[TTS] 🎧 Kokoro generating ({voice_name})...")
-        start = time.time()
-
-        audio, sample_rate = kokoro_model.create(text, voice=voice_name, speed=1.0)
-        audio_int16 = (audio * 32767).astype(np.int16)
-        wavfile.write(wav_path, sample_rate, audio_int16)
-
-        print(f"[TTS] ✅ Kokoro done in {time.time()-start:.2f}s")
-        return True
-
-    except Exception as e:
-        print(f"[TTS] ❌ Kokoro error: {e}")
-        return False
-
-def generate_piper_tts(text: str, wav_path: str, voice_name: str):
-    """Generate Dutch audio using Piper"""
-    try:
-        print(f"[TTS] 🎧 Piper generating ({voice_name})...")
-        start = time.time()
-
-        voices_dir = os.environ.get('PIPER_VOICES_DIR', '/opt/piper/voices')
-
-        process = subprocess.Popen(
-            [PIPER_PATH, "--model", voice_name, "--data-dir", voices_dir, "--output_file", wav_path],
-            stdin=subprocess.PIPE,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True
-        )
-
-        stdout, stderr = process.communicate(input=text, timeout=30)
-
-        if process.returncode != 0:
-            print(f"[TTS] ❌ Piper error: {stderr}")
-            return False
-
-        if os.path.exists(wav_path) and os.path.getsize(wav_path) > 0:
-            print(f"[TTS] ✅ Piper done in {time.time()-start:.2f}s")
-            return True
-
-        return False
-
-    except Exception as e:
-        print(f"[TTS] ❌ Piper error: {e}")
-        return False
-
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
     await websocket.accept()
     active_connections.append(websocket)
-    print(f"[WebSocket] ✅ Client connected. Total: {len(active_connections)}")
     try:
         while True:
             await websocket.receive_text()
@@ -172,38 +102,40 @@ async def speak(text: str = Form(...), voice: str = Form("female"), language: st
     wav_path = os.path.join(OUT_DIR, f"{uid}.wav")
     mp3_path = os.path.join(OUT_DIR, f"{uid}.mp3")
 
+    # Map 'female'/'male' to specific XTTS speaker names
+    speaker_name = VOICE_MAP.get(voice, voice)  # Fallback to input if it's already a name
+
     print(f"\n{'='*60}")
-    print(f"[Backend]
+    print(f"[Backend] XTTS v2 Generation")
     print(f"[Backend] Text: '{text}'")
-    print(f"[Backend]
+    print(f"[Backend] Lang: {language} | Speaker: {speaker_name}")
 
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-        if not success:
-            raise Exception("TTS generation failed")
+        if tts is None:
+            raise Exception("TTS Model not initialized")
+
+        # Generate Audio
+        # split_sentences=True is generally better for long text in XTTS
+        tts.tts_to_file(
+            text=text,
+            file_path=wav_path,
+            speaker=speaker_name,
+            language=language,
+            split_sentences=True
+        )
 
         t2 = time.time()
-        print(f"[Backend]
+        print(f"[Backend] ✅ Generated in {t2-t_start:.2f}s")
 
-        # Convert to MP3
+        # Convert to MP3 for web (smaller size)
         try:
            audio = AudioSegment.from_wav(wav_path)
            audio.export(mp3_path, format="mp3", bitrate="128k")
            duration_sec = len(audio) / 1000.0
            os.remove(wav_path)
            audio_file = mp3_path
-        except:
+        except Exception as e:
+            print(f"[Backend] ⚠️ MP3 conversion failed, using WAV: {e}")
            with wave.open(wav_path, 'rb') as wf:
                duration_sec = wf.getnframes() / float(wf.getframerate())
            audio_file = wav_path
@@ -221,7 +153,7 @@ async def speak(text: str = Form(...), voice: str = Form("female"), language: st
         return response_data
 
     except Exception as e:
-        error_msg = f"
+        error_msg = f"XTTS failed: {str(e)}"
         print(f"[Backend] ❌ {error_msg}")
         return JSONResponse(status_code=500, content={"error": error_msg})
 
@@ -229,18 +161,12 @@ async def speak(text: str = Form(...), voice: str = Form("female"), language: st
 async def root():
     return {
         "status": "running",
-        "tts_engine": "
-        "
-
-            "nl": "piper"
-        },
-        "voices": {
-            "en": KOKORO_VOICES,
-            "nl": PIPER_VOICES
-        }
+        "tts_engine": "coqui-xtts-v2",
+        "languages": ["en", "nl", "fr", "de", "it", "es", "ja", "zh", "pt", "pl", "tr", "ru", "cs", "ar", "hu", "ko"],
+        "voices": VOICE_MAP
     }
 
 if __name__ == "__main__":
     import uvicorn
-    print("🚀
+    print("🚀 Avatar Server (Coqui XTTS v2)")
     uvicorn.run(app, host="0.0.0.0", port=8765)
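Since the form fields (`text`, `voice`, `language`) are unchanged, the new engine can be exercised with a small client like the one below (a sketch: it assumes the route is `/speak` on port 8765, as suggested by the `uvicorn.run` call and the function name, and that `requests` is available):

```python
# speak_client.py - hypothetical smoke test for the XTTS-backed endpoint.
import requests

resp = requests.post(
    "http://localhost:8765/speak",  # route name is an assumption
    data={"text": "Hallo, dit is een test.", "voice": "female", "language": "nl"},
    timeout=120,  # the first XTTS generation after startup can be slow
)
resp.raise_for_status()
print(resp.json())  # response payload (audio URL, duration, visemes, ...)
```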
mrrrme/audio/voice_assistant.py  CHANGED

@@ -1,4 +1,4 @@
-"""Text-to-Speech using
+"""Text-to-Speech using Coqui XTTS v2 (Multi-lingual)"""
 import os
 import time
 import tempfile
@@ -6,28 +6,28 @@ import threading
 import pygame
 import torch
 import numpy as np
-import scipy.io.wavfile as wavfile
-from pydub import AudioSegment
 from dotenv import load_dotenv
-from
+from TTS.api import TTS  # Coqui TTS
 
 load_dotenv()
 
+# XTTS v2 Default Speakers
+# These are standard speakers included in the xtts_v2 model.
 VOICE_MAP = {
-    "female": "
-    "male": "
-    "Happy": "
-    "Sad": "
-    "Angry": "
-    "Neutral": "
+    "female": "Ana Florence",
+    "male": "Andrew Chipper",
+    "Happy": "Ana Florence",
+    "Sad": "Ana Florence",
+    "Angry": "Andrew Chipper",
+    "Neutral": "Ana Florence",
 }
 
 class VoiceAssistant:
-    """
+    """Coqui XTTS v2 TTS"""
 
     def __init__(self, voice: str = "female", rate: float = 1.0, language: str = "en"):
         self.voice_key = voice
-        self.voice_name = VOICE_MAP.get(voice, "
+        self.voice_name = VOICE_MAP.get(voice, "Ana Florence")
         self.rate = rate
         self.language = language
 
@@ -36,18 +36,20 @@ class VoiceAssistant:
         self.speaking_lock = threading.Lock()
         self.audio_workers = []
 
-        print(f"[TTS] 🚀
+        print(f"[TTS] 🚀 Initializing Coqui XTTS v2...")
 
-        # Initialize
+        # Initialize Coqui TTS with XTTS v2 model
+        # gpu=True will use CUDA if available
         try:
-
-
-
-            )
-
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            print(f"[TTS] 🔥 Loading XTTS v2 model on {device} (this may take time on first run)...")
+
+            self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+
+            print(f"[TTS] ✅ XTTS v2 model loaded")
         except Exception as e:
-            print(f"[TTS] ⚠️
-            self.
+            print(f"[TTS] ⚠️ XTTS init error: {e}")
+            self.tts = None
 
         print("[TTS] 🎧 Initializing pygame...")
         try:
@@ -64,22 +66,35 @@ class VoiceAssistant:
             print(f"[TTS] ✅ Registered: {worker.__class__.__name__}")
 
     def set_voice(self, voice_key: str):
+        """Switch between male/female voices"""
         if voice_key in VOICE_MAP:
             self.voice_name = VOICE_MAP[voice_key]
             self.voice_key = voice_key
             print(f"[TTS] 🎙️ Voice → {self.voice_name}")
         else:
+            # If user passes a raw speaker name that exists in XTTS
             self.voice_name = voice_key
+            print(f"[TTS] 🎙️ Voice → {self.voice_name} (Custom)")
 
     def set_language(self, language: str):
+        """Set language (e.g., 'en', 'nl')"""
         self.language = language
         print(f"[TTS] 🌍 Language → {language}")
 
     def set_rate(self, rate: float):
+        """
+        Note: XTTS v2 does not natively support speed control via API in the same way.
+        This is kept for compatibility but might not affect generation speed directly.
+        """
         self.rate = max(0.5, min(2.0, rate))
-        print(f"[TTS] 🎚️ Rate → {self.rate}x")
+        print(f"[TTS] 🎚️ Rate → {self.rate}x (XTTS may ignore this)")
 
     def apply_emotion_voice(self, emotion: str, intensity: float = 0.5):
+        """
+        Adjusts internal state based on emotion.
+        Note: XTTS implies emotion via the input text or style transfer (if enabled).
+        For now, we just log it or adjust simple parameters.
+        """
         if emotion == "Happy":
             self.rate = 1.1
         elif emotion == "Sad":
@@ -109,26 +124,28 @@ class VoiceAssistant:
 
     def _get_unique_filename(self, ext: str = ".wav"):
         self.counter += 1
-        return os.path.join(tempfile.gettempdir(), f"
+        return os.path.join(tempfile.gettempdir(), f"xtts_{self.counter}_{int(time.time() * 1000)}{ext}")
 
     def _generate_speech(self, text: str, filename: str):
-        """Generate speech using
+        """Generate speech using Coqui XTTS v2"""
         try:
-
-
-
-            self.kokoro = Kokoro(
-                model_path="/app/kokoro_models/kokoro-v0_19.onnx",
-                voices_path="/app/kokoro_models/voices.bin"
-            )
+            if self.tts is None:
+                print("[TTS] ❌ Model not initialized")
+                return False
 
-            print(f"[TTS] 🎧 Generating...")
+            print(f"[TTS] 🎧 Generating with {self.voice_name} ({self.language})...")
             start = time.time()
 
-            #
-
-
-
+            # XTTS v2 Generation
+            # We use the named speaker directly.
+            # Note: XTTS usually does not support 'speed' arg in tts_to_file directly.
+            self.tts.tts_to_file(
+                text=text,
+                file_path=filename,
+                speaker=self.voice_name,
+                language=self.language,
+                split_sentences=True
+            )
 
             gen_time = time.time() - start
             print(f"[TTS] ✅ Generated in {gen_time:.2f}s")
@@ -166,6 +183,7 @@ class VoiceAssistant:
 
         print(f"\n[TTS] 🔊 Speaking ({self.language}): '{text[:80]}...'")
 
+        # Pause workers (listening)
         for worker in self.audio_workers:
             if hasattr(worker, 'pause_listening'):
                 try:
@@ -195,6 +213,7 @@ class VoiceAssistant:
 
         time.sleep(0.2)
 
+        # Resume workers
        for worker in self.audio_workers:
            if hasattr(worker, 'resume_listening'):
                try:
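For completeness, a driver sketch for the rewritten class (assumptions: the import path matches the file location, the XTTS model is already cached, and the public speak() method from the unchanged part of the file still exists):

```python
# Hypothetical usage of the XTTS-backed VoiceAssistant (a sketch).
from mrrrme.audio.voice_assistant import VoiceAssistant

va = VoiceAssistant(voice="female", rate=1.0, language="en")
va.set_language("nl")   # XTTS v2 handles Dutch natively, no second engine needed
va.set_voice("male")    # maps to "Andrew Chipper" via VOICE_MAP
va.speak("Hallo! Dit is de nieuwe Coqui-stem.")  # speak() assumed from unchanged code
```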
requirements_docker.txt  CHANGED

@@ -1,4 +1,4 @@
-# MrrrMe Backend Requirements -
+# MrrrMe Backend Requirements - Coqui XTTS v2
 # Core frameworks
 fastapi==0.115.4
 uvicorn[standard]==0.32.0
@@ -14,7 +14,7 @@ timm==1.0.11
 einops==0.8.0
 
 # ML/DL
-# NOTE: torch
+# NOTE: torch, torchvision, torchaudio are installed in Dockerfile with CUDA support
 numpy==1.26.4
 scipy==1.13.1
 pandas==2.2.3
@@ -31,10 +31,10 @@ pydub==0.25.1
 sounddevice==0.5.1
 webrtcvad==2.0.10
 
-# Text-to-Speech
-
-
-
+# Text-to-Speech
+coqui-tts==0.24.2
+# Note: coqui-tts pulls in a lot of deps, but we constrain torch in Dockerfile.
+# If build fails on conflicts, remove the version constraint here.
 
 # Text processing
 nltk==3.9.1
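Given the note about dependency conflicts, a quick post-install check can confirm the pins that matter actually resolved (a sketch, standard library only; the torch pin 2.4.0 is inferred from the torchaudio/torchvision pins in the Dockerfile):

```python
# check_pins.py - hypothetical post-install verification (a sketch).
from importlib.metadata import version, PackageNotFoundError

for pkg, want in [("coqui-tts", "0.24.2"), ("torch", "2.4.0"), ("numpy", "1.26.4")]:
    try:
        got = version(pkg)
        status = "OK" if got.startswith(want) else f"expected {want}"
        print(f"{pkg}: {got} ({status})")
    except PackageNotFoundError:
        print(f"{pkg}: MISSING")
```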