michon committed
Commit 6b5d3ca
1 Parent(s): 9d3aa21
Dockerfile CHANGED
@@ -1,9 +1,7 @@
-# Hugging Face Spaces - MrrrMe Emotion AI
-# All-in-one container with PERSISTENT DATABASE SUPPORT
-
+# Hugging Face Spaces - MrrrMe Emotion AI with PIPER TTS
 FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
 
-# Install system dependencies + Dev Mode requirements
+# Install system dependencies
 RUN apt-get update && apt-get install -y \
     bash \
     git \
@@ -29,28 +27,45 @@ RUN apt-get update && apt-get install -y \
     unzip \
     && rm -rf /var/lib/apt/lists/*
 
-# Install Node.js 20 (required for Next.js 16)
+# Install Node.js 20
 RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
     apt-get install -y nodejs && \
     rm -rf /var/lib/apt/lists/*
 
-# Install Rhubarb Lip Sync with ALL resources
+# Install Piper TTS
+RUN wget https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_amd64.tar.gz -O /tmp/piper.tar.gz && \
+    tar -xzf /tmp/piper.tar.gz -C /opt && \
+    ln -s /opt/piper/piper /usr/local/bin/piper && \
+    chmod +x /opt/piper/piper && \
+    rm /tmp/piper.tar.gz
+
+# Download Piper voices
+RUN mkdir -p /opt/piper/voices && cd /opt/piper/voices && \
+    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/amy/medium/en_US-amy-medium.onnx && \
+    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/amy/medium/en_US-amy-medium.onnx.json && \
+    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/ryan/high/en_US-ryan-high.onnx && \
+    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/en/en_US/ryan/high/en_US-ryan-high.onnx.json && \
+    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/nl/nl_NL/mls/medium/nl_NL-mls-medium.onnx && \
+    wget -q https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0/nl/nl_NL/mls/medium/nl_NL-mls-medium.onnx.json
+
+ENV PIPER_VOICES_DIR=/opt/piper/voices
+
+# Install Rhubarb Lip Sync
 RUN wget https://github.com/DanielSWolf/rhubarb-lip-sync/releases/download/v1.13.0/Rhubarb-Lip-Sync-1.13.0-Linux.zip -O /tmp/rhubarb.zip && \
     unzip /tmp/rhubarb.zip -d /tmp && \
     mkdir -p /opt/rhubarb && \
     cp -r /tmp/Rhubarb-Lip-Sync-1.13.0-Linux/* /opt/rhubarb/ && \
     ln -s /opt/rhubarb/rhubarb /usr/local/bin/rhubarb && \
     chmod +x /opt/rhubarb/rhubarb && \
-    rm -rf /tmp/rhubarb* && \
-    echo "✅ Rhubarb Lip Sync installed with resources"
+    rm -rf /tmp/rhubarb*
 
 # Set Python 3.11 as default
-RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
-RUN update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
+    update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
 
 WORKDIR /app
 
-# Install PyTorch with CUDA 11.8 support
+# Install PyTorch with CUDA 11.8
 RUN python3.11 -m pip install --no-cache-dir \
     torch==2.4.0 \
     torchvision==0.19.0 \
@@ -69,7 +84,7 @@ RUN PIP_CONSTRAINT=/tmp/constraints.txt python3.11 -m pip install --no-cache-dir
 
 # Install avatar dependencies
 RUN python3.11 -m pip install --no-cache-dir \
-    fastapi uvicorn python-multipart edge-tts pydub websockets
+    fastapi uvicorn python-multipart pydub websockets piper-tts onnxruntime
 
 # Copy application code
 COPY --link --chown=1000:1000 mrrrme/ ./mrrrme/
@@ -80,7 +95,7 @@ COPY --link --chown=1000:1000 weights/ ./weights/
 # Create directories
 RUN mkdir -p /app/weights /app/avatar/static
 
-# Fix openface bug - single line version
+# Fix openface bug
 RUN python3.11 -c "import os; fp='/usr/local/lib/python3.11/dist-packages/openface/multitask_model.py'; c=open(fp).read() if os.path.exists(fp) else ''; exec(\"if os.path.exists(fp) and 'import cv2' not in c:\\n open(fp,'w').write('import cv2\\\\n'+c)\\n print('Patched')\")"
 
 # Build frontend
@@ -91,7 +106,7 @@ RUN npm ci
 COPY --link --chown=1000:1000 avatar-frontend/ ./
 RUN npm run build
 
-# Copy static files to standalone
+# Copy static files
 RUN cp -r .next/static .next/standalone/.next/ && \
     cp -r public .next/standalone/ 2>/dev/null || true
 
@@ -109,16 +124,13 @@ RUN mkdir -p /etc/nginx/certs && \
     -subj "/CN=mrrrme.hf.space"
 
 # Create startup script
-RUN printf '#!/bin/bash\nset -e\n\n# Configure git for Dev Mode\nexport HOME=/tmp\nmkdir -p /tmp\ngit config --global user.name "michon" 2>/dev/null || true\ngit config --global user.email "[email protected]" 2>/dev/null || true\n\n# Check persistent storage\nif [ -d "/data" ] && [ -w "/data" ]; then\n echo "📁 Persistent storage: /data (survives rebuilds)"\n chmod 777 /data 2>/dev/null || true\nelse\n echo "⚠️ Ephemeral storage: /tmp (lost on rebuild)"\n echo "Enable persistent storage in Space Settings!"\nfi\n\n# Kill any existing processes\npkill -f "mrrrme.backend_server" 2>/dev/null || true\npkill -f "speak_server.py" 2>/dev/null || true\npkill -f "node server.js" 2>/dev/null || true\npkill -f "nginx" 2>/dev/null || true\nsleep 2\n\necho "Starting MrrrMe..."\ncd /app && python3.11 -m mrrrme.backend_server &\ncd /app/avatar && python3.11 speak_server.py &\ncd /app/frontend/.next/standalone && HOSTNAME=0.0.0.0 PORT=3001 node server.js &\nsleep 10\nnginx -g "daemon off;" &\necho "Ready!"\n\n# Keep container alive\nwait\n' > /app/start.sh && chmod +x /app/start.sh
+RUN printf '#!/bin/bash\nset -e\nexport HOME=/tmp\nmkdir -p /tmp\ngit config --global user.name "michon" 2>/dev/null || true\ngit config --global user.email "[email protected]" 2>/dev/null || true\nif [ -d "/data" ] && [ -w "/data" ]; then\n echo "Persistent storage: /data"\n chmod 777 /data 2>/dev/null || true\nelse\n echo "Ephemeral storage: /tmp"\nfi\nexport PIPER_VOICES_DIR=/opt/piper/voices\npkill -f "mrrrme.backend_server" 2>/dev/null || true\npkill -f "speak_server.py" 2>/dev/null || true\npkill -f "node server.js" 2>/dev/null || true\npkill -f "nginx" 2>/dev/null || true\nsleep 2\necho "Starting MrrrMe..."\ncd /app && python3.11 -m mrrrme.backend_server &\ncd /app/avatar && python3.11 speak_server.py &\ncd /app/frontend/.next/standalone && HOSTNAME=0.0.0.0 PORT=3001 node server.js &\nsleep 10\nnginx -g "daemon off;" &\necho "Ready!"\nwait\n' > /app/start.sh && chmod +x /app/start.sh
 
-# Set ownership of entire /app to user 1000
+# Set ownership
 RUN chown -R 1000:1000 /app
 
-# CRITICAL: Switch to user 1000 for Dev Mode
 USER 1000
 
-# Expose Hugging Face Spaces port
 EXPOSE 7860
 
-# Start all services
 CMD ["/app/start.sh"]
avatar/speak_server.py CHANGED
@@ -1,10 +1,10 @@
-"""Avatar Backend - EDGE TTS (FREE) + PATTERN-BASED LIP SYNC"""
+"""Avatar Backend - PIPER TTS (LOCAL & FREE) + PATTERN-BASED LIP SYNC"""
 import os
 import json
 import uuid
 import time
-import asyncio
-import edge_tts
+import subprocess
+import wave
 from fastapi import FastAPI, Form, WebSocket
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
@@ -18,19 +18,23 @@ load_dotenv()
 OUT_DIR = "/tmp/avatar_static"
 os.makedirs(OUT_DIR, exist_ok=True)
 
-# Edge TTS Voice Mapping
-# Use `edge-tts --list-voices` to see all options
+# Piper Voice Mapping
+# Download voices from: https://github.com/rhasspy/piper/releases
+# Or use: piper --download-dir ./voices --list-voices
 VOICE_MAP = {
     "en": {
-        "female": "en-US-AriaNeural",
-        "male": "en-US-GuyNeural"
+        "female": "en_US-amy-medium",   # Clear, friendly female voice
+        "male": "en_US-ryan-high"       # Clear, friendly male voice
     },
     "nl": {
-        "female": "nl-NL-FennaNeural",
-        "male": "nl-NL-MaartenNeural"
+        "female": "nl_NL-mls-medium",   # Dutch female
+        "male": "nl_BE-rdh-medium"      # Belgian Dutch male (NOTE: not fetched by the Dockerfile voices step)
     }
 }
 
+# Piper executable path (adjust if needed)
+PIPER_PATH = "piper"  # Assumes piper is in PATH
+
 app = FastAPI()
 app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 app.mount("/static", StaticFiles(directory=OUT_DIR), name="static")
@@ -78,19 +82,42 @@ def text_to_visemes_simple(text: str, duration: float):
 
     return visemes
 
-async def generate_edge_tts(text: str, mp3_path: str, voice_name: str):
-    """Generate audio using Edge TTS"""
+def generate_piper_tts(text: str, wav_path: str, voice_name: str):
+    """Generate audio using Piper TTS"""
     try:
-        print(f"[TTS] 🔧 Generating with Edge TTS ({voice_name})...")
-        communicate = edge_tts.Communicate(text, voice_name)
-        await communicate.save(mp3_path)
+        print(f"[TTS] 🔧 Generating with Piper TTS ({voice_name})...")
+
+        # Piper command: echo "text" | piper --model <model>.onnx --output_file output.wav
+        # Note: the piper binary expects a filesystem path to the .onnx model, so resolve it from PIPER_VOICES_DIR.
+
+        model_path = os.path.join(os.environ.get("PIPER_VOICES_DIR", "/opt/piper/voices"), f"{voice_name}.onnx")
+        process = subprocess.Popen(
+            [PIPER_PATH, "--model", model_path, "--output_file", wav_path],
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True
+        )
 
-        if os.path.exists(mp3_path) and os.path.getsize(mp3_path) > 0:
+        stdout, stderr = process.communicate(input=text, timeout=30)
+
+        if process.returncode != 0:
+            print(f"[TTS] ❌ Piper error: {stderr}")
+            return False
+
+        if os.path.exists(wav_path) and os.path.getsize(wav_path) > 0:
+            print(f"[TTS] ✅ Generated: {wav_path}")
             return True
+
+        return False
+
+    except subprocess.TimeoutExpired:
+        print(f"[TTS] ❌ Piper timeout")
+        process.kill()
         return False
     except Exception as e:
-        print(f"[TTS] ❌ Edge TTS error: {e}")
-        raise e
+        print(f"[TTS] ❌ Piper error: {e}")
+        return False
 
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
@@ -118,33 +145,48 @@ async def broadcast_to_avatars(data: dict):
 async def speak(text: str = Form(...), voice: str = Form("female"), language: str = Form("en")):
     t_start = time.time()
     uid = uuid.uuid4().hex[:8]
+    wav_path = os.path.join(OUT_DIR, f"{uid}.wav")
     mp3_path = os.path.join(OUT_DIR, f"{uid}.mp3")
 
     # Select voice based on preferences
-    # Default to 'en' and 'female' if invalid keys provided
    lang_map = VOICE_MAP.get(language, VOICE_MAP["en"])
    selected_voice = lang_map.get(voice, lang_map["female"])
 
     print(f"\n{'='*60}")
-    print(f"[Backend] [{time.strftime('%H:%M:%S')}] EDGE TTS GENERATION")
+    print(f"[Backend] [{time.strftime('%H:%M:%S')}] PIPER TTS GENERATION")
     print(f"[Backend] Text: '{text}'")
     print(f"[Backend] Params: voice={voice}, lang={language} -> {selected_voice}")
 
     try:
-        # Step 1: Generate TTS
+        # Step 1: Generate TTS with Piper
         t1 = time.time()
-        await generate_edge_tts(text, mp3_path, selected_voice)
+        success = generate_piper_tts(text, wav_path, selected_voice)
+
+        if not success:
+            raise Exception("Piper TTS generation failed")
+
         t2 = time.time()
         print(f"[Backend] [+{t2-t_start:.2f}s] Audio generated ({t2-t1:.2f}s)")
 
-        # Step 2: Get audio duration for lip sync
+        # Step 2: Convert WAV to MP3 (optional, for better web compatibility)
         try:
-            audio = AudioSegment.from_file(mp3_path)
+            audio = AudioSegment.from_wav(wav_path)
+            audio.export(mp3_path, format="mp3", bitrate="128k")
             duration_sec = len(audio) / 1000.0
-        except Exception as e:
-            print(f"[Backend] ⚠️ Could not read audio duration: {e}")
-            # Fallback estimation: roughly 15 chars per second
-            duration_sec = max(1.5, len(text) / 15.0)
+
+            # Clean up WAV file
+            os.remove(wav_path)
+
+            audio_file = mp3_path
+            print(f"[Backend] ✅ Converted to MP3")
+        except Exception as conv_err:
+            print(f"[Backend] ⚠️ MP3 conversion failed, using WAV: {conv_err}")
+            # Read duration from WAV
+            with wave.open(wav_path, 'rb') as wf:
+                frames = wf.getnframes()
+                rate = wf.getframerate()
+                duration_sec = frames / float(rate)
+            audio_file = wav_path
 
         # Step 3: Generate visemes
         t3 = time.time()
@@ -154,7 +196,7 @@ async def speak(text: str = Form(...), voice: str = Form("female"), language: str = Form("en")):
     print(f"[Backend] [+{t4-t_start:.2f}s] Visemes generated: {len(visemes)}")
 
     response_data = {
-        "audio_url": f"/static/{os.path.basename(mp3_path)}",
+        "audio_url": f"/static/{os.path.basename(audio_file)}",
         "visemes": visemes,
         "duration": duration_sec,
         "text": text
@@ -171,7 +213,19 @@ async def speak(text: str = Form(...), voice: str = Form("female"), language: str = Form("en")):
             content={"error": error_msg, "text": text}
         )
 
+@app.get("/")
+async def root():
+    """Health check endpoint"""
+    return {
+        "status": "running",
+        "tts_engine": "piper",
+        "voices": VOICE_MAP
+    }
+
 if __name__ == "__main__":
     import uvicorn
-    print("🚀 Edge TTS Server starting on port 8765...")
+    print("🚀 Piper TTS Server starting on port 8765...")
+    print("📦 Make sure Piper voices are installed!")
+    print("   Download from: https://github.com/rhasspy/piper/releases")
+    print("   Or use: piper --download-dir ./voices --list-voices")
     uvicorn.run(app, host="0.0.0.0", port=8765)
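For manual testing, something along these lines exercises the new Piper path end to end. It is a sketch, not part of the commit: it assumes the speak handler is mounted at POST /speak (the route decorator sits outside the visible hunks) and that the requests library is available.

"""Example client for the avatar TTS server on port 8765."""
import requests

resp = requests.post(
    "http://localhost:8765/speak",  # assumed route for the `speak` handler
    data={"text": "Hello from Piper!", "voice": "female", "language": "en"},
    timeout=60,
)
resp.raise_for_status()
payload = resp.json()

# Mirrors response_data built in the handler.
print("audio:", payload["audio_url"])       # /static/<uid>.mp3, or .wav on the fallback path
print("duration:", payload["duration"])     # seconds, drives viseme timing
print("visemes:", len(payload["visemes"]))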
mrrrme/audio/voice_assistant.py CHANGED
@@ -1,55 +1,76 @@
-"""Text-to-Speech using Edge TTS (FREE & High Quality) - DOCKER/HF SPACES VERSION"""
+"""Text-to-Speech using Piper TTS (FREE, LOCAL & FAST) - DOCKER/HF SPACES VERSION"""
 import os
 import time
 import tempfile
 import threading
-import asyncio
+import subprocess
 import pygame
-import edge_tts
+from pydub import AudioSegment
 from dotenv import load_dotenv
 
 load_dotenv()
 
 # ========== Configuration ==========
 
-# Edge TTS Voices (Free, High Quality)
-# Check `edge-tts --list-voices` for more
+# Piper TTS Voices (Local, High Quality, FAST)
+# Download voices from: https://github.com/rhasspy/piper/releases/tag/v1.2.0
 VOICE_MAP = {
-    "female": "en-US-AriaNeural",   # Standard Friendly Female
-    "male": "en-US-GuyNeural",      # Standard Friendly Male
-    "Happy": "en-US-AriaNeural",
-    "Sad": "en-US-AriaNeural",
-    "Angry": "en-US-AriaNeural",
-    "Neutral": "en-US-AriaNeural",
+    "female": "en_US-amy-medium",   # Clear, natural female voice
+    "male": "en_US-ryan-high",      # Clear, natural male voice
+    "Happy": "en_US-amy-medium",
+    "Sad": "en_US-amy-low",         # NOTE: not fetched by the Dockerfile voices step
+    "Angry": "en_US-joe-medium",    # NOTE: not fetched by the Dockerfile voices step
+    "Neutral": "en_US-amy-medium",
 }
 
+# Piper executable path
+PIPER_PATH = os.environ.get("PIPER_PATH", "piper")
+
 class VoiceAssistant:
     """
-    Edge TTS (Free Microsoft Azure Voices)
-    High quality, no account required, unlimited free usage.
+    Piper TTS (Local Neural TTS)
+    - 100% offline
+    - Real-time generation
+    - High quality voices
+    - No API keys required
     """
 
-    def __init__(self, voice: str = "female", rate: str = "+0%"):
+    def __init__(self, voice: str = "female", rate: float = 1.0):
         self.voice_key = voice
-        self.voice_name = VOICE_MAP.get(voice, "en-US-AriaNeural")
-        self.rate = rate
-        self.pitch = "+0Hz"
+        self.voice_name = VOICE_MAP.get(voice, "en_US-amy-medium")
+        self.rate = rate  # Speech rate multiplier (0.5-2.0)
 
         self.counter = 0
         self.is_speaking = False
         self.speaking_lock = threading.Lock()
         self.audio_workers = []
 
+        # Check if Piper is installed
+        try:
+            result = subprocess.run(
+                [PIPER_PATH, "--version"],
+                capture_output=True,
+                text=True,
+                timeout=5
+            )
+            print(f"[TTS] ✅ Piper version: {result.stdout.strip()}")
+        except FileNotFoundError:
+            print(f"[TTS] ❌ Piper not found! Install with: pip install piper-tts")
+            print(f"[TTS]    Or download from: https://github.com/rhasspy/piper/releases")
+            raise
+        except Exception as e:
+            print(f"[TTS] ⚠️ Piper check failed: {e}")
+
         # Init pygame for playback
         print("[TTS] 🔧 Initializing pygame...")
         try:
             pygame.mixer.quit()
-            pygame.mixer.init(frequency=24000, size=-16, channels=1, buffer=2048)
+            pygame.mixer.init(frequency=22050, size=-16, channels=1, buffer=2048)
             print(f"[TTS] ✅ Pygame ready")
         except Exception as e:
             print(f"[TTS] ⚠️ Pygame warning: {e}")
 
-        print(f"[TTS] ✅ Ready with Edge TTS ({self.voice_name})!\n")
+        print(f"[TTS] ✅ Ready with Piper TTS ({self.voice_name})!\n")
 
     def register_audio_worker(self, worker):
         self.audio_workers.append(worker)
@@ -60,15 +81,28 @@ class VoiceAssistant:
         if voice_key in VOICE_MAP:
             self.voice_name = VOICE_MAP[voice_key]
             self.voice_key = voice_key
-            print(f"[TTS] 🎙️ voice → {self.voice_name}")
-        elif voice_key.startswith("en-") or voice_key.startswith("nl-"):
-            self.voice_name = voice_key  # Allow direct setting
+            print(f"[TTS] 🎙️ Voice → {self.voice_name}")
+        else:
+            # Allow direct voice name setting
+            self.voice_name = voice_key
 
-    def set_rate(self, rate: str):
-        self.rate = rate
+    def set_rate(self, rate: float):
+        """Set speech rate (0.5 = slow, 1.0 = normal, 2.0 = fast)"""
+        self.rate = max(0.5, min(2.0, rate))
+        print(f"[TTS] 🎚️ Rate → {self.rate}x")
 
     def apply_emotion_voice(self, emotion: str, intensity: float = 0.5):
-        pass
+        """Apply emotion-specific voice settings"""
+        # Piper doesn't have built-in emotion control,
+        # but we can adjust rate and potentially voice
+        if emotion == "Happy":
+            self.rate = 1.1   # Slightly faster
+        elif emotion == "Sad":
+            self.rate = 0.9   # Slightly slower
+        elif emotion == "Angry":
+            self.rate = 1.2   # Faster
+        else:  # Neutral
+            self.rate = 1.0
 
     def stop(self):
         print("[TTS] 🛑 STOP")
@@ -83,36 +117,58 @@ class VoiceAssistant:
 
         for worker in self.audio_workers:
             if hasattr(worker, 'resume_listening'):
-                try: worker.resume_listening()
-                except: pass
+                try:
+                    worker.resume_listening()
+                except:
+                    pass
 
-    def _get_unique_filename(self, ext: str = ".mp3"):
+    def _get_unique_filename(self, ext: str = ".wav"):
         self.counter += 1
         return os.path.join(
             tempfile.gettempdir(),
-            f"edge_{self.counter}_{int(time.time() * 1000)}{ext}"
+            f"piper_{self.counter}_{int(time.time() * 1000)}{ext}"
         )
 
-    async def _generate_speech_async(self, text: str, filename: str):
-        """Async generation function"""
-        communicate = edge_tts.Communicate(text, self.voice_name, rate=self.rate, pitch=self.pitch)
-        await communicate.save(filename)
-
     def _generate_speech(self, text: str, filename: str):
+        """Generate speech using Piper TTS"""
         try:
-            print(f"[TTS] 🔧 Generating with Edge TTS...")
+            print(f"[TTS] 🔧 Generating with Piper...")
             start = time.time()
 
-            # Run async function in synchronous wrapper
-            asyncio.run(self._generate_speech_async(text, filename))
+            # The piper binary reads text on stdin and needs a filesystem path to the .onnx model
+            cmd = [
+                PIPER_PATH,
+                "--model", os.path.join(os.environ.get("PIPER_VOICES_DIR", "/opt/piper/voices"), f"{self.voice_name}.onnx"),
+                "--output_file", filename
+            ]
+
+            # Add length scale (rate) if not 1.0
+            if self.rate != 1.0:
+                cmd.extend(["--length_scale", str(1.0 / self.rate)])
+
+            process = subprocess.Popen(
+                cmd,
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+
+            stdout, stderr = process.communicate(input=text, timeout=30)
 
             gen_time = time.time() - start
 
-            if os.path.exists(filename) and os.path.getsize(filename) > 0:
+            if process.returncode == 0 and os.path.exists(filename) and os.path.getsize(filename) > 0:
                 print(f"[TTS] ✅ Generated in {gen_time:.2f}s")
                 return True
-            return False
+            else:
+                print(f"[TTS] ❌ Generation failed: {stderr}")
+                return False
 
+        except subprocess.TimeoutExpired:
+            print(f"[TTS] ❌ Timeout")
+            process.kill()
+            return False
         except Exception as e:
            print(f"[TTS] ❌ Error: {e}")
            return False
@@ -146,13 +202,15 @@ class VoiceAssistant:
         # Pause workers
         for worker in self.audio_workers:
             if hasattr(worker, 'pause_listening'):
-                try: worker.pause_listening()
-                except: pass
+                try:
+                    worker.pause_listening()
+                except:
+                    pass
 
         with self.speaking_lock:
             self.is_speaking = True
 
-        temp_file = self._get_unique_filename(".mp3")
+        temp_file = self._get_unique_filename(".wav")
 
         try:
             if self._generate_speech(text, temp_file):
@@ -161,7 +219,8 @@ class VoiceAssistant:
             try:
                 if os.path.exists(temp_file):
                     os.remove(temp_file)
-            except: pass
+            except:
+                pass
 
         except Exception as e:
             print(f"[TTS] ❌ Error: {e}")
@@ -174,8 +233,10 @@ class VoiceAssistant:
         # Resume workers
         for worker in self.audio_workers:
             if hasattr(worker, 'resume_listening'):
-                try: worker.resume_listening()
-                except: pass
+                try:
+                    worker.resume_listening()
+                except:
+                    pass
 
     def speak_async(self, text: str):
         threading.Thread(target=self.speak, args=(text,), daemon=True).start()
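Usage is unchanged from the Edge TTS version; a minimal sketch (not part of the commit, and it assumes the container environment: piper on PATH, voices present, and an audio device for pygame):

"""Minimal usage of the Piper-backed VoiceAssistant."""
import time
from mrrrme.audio.voice_assistant import VoiceAssistant

tts = VoiceAssistant(voice="female")   # probes the piper binary and initializes pygame
tts.apply_emotion_voice("Happy")       # nudges self.rate to 1.1x
tts.speak_async("Piper runs fully offline.")  # WAV is generated and played on a daemon thread
time.sleep(5)  # keep the main thread alive long enough for the daemon thread to finish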
requirements_docker.txt CHANGED
@@ -31,11 +31,11 @@ pyaudio==0.2.14
 pydub==0.25.1
 sounddevice==0.5.1
 webrtcvad==2.0.10
-# openai-whisper removed - using browser Speech Recognition API instead
 
-# Text-to-Speech
-elevenlabs==1.9.0
+# Text-to-Speech (PIPER - Local & Fast)
+piper-tts==1.2.0
 pygame==2.6.1
+onnxruntime==1.16.3
 
 # Text processing
 nltk==3.9.1
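A quick import check for the new pins can catch a broken piper-tts/onnxruntime install early. A sketch under the assumption that piper is the import name the piper-tts package exposes (treat that as unverified):

"""Verify the TTS-related pins import cleanly."""
import importlib

for module in ("pydub", "pygame", "onnxruntime", "piper"):
    try:
        importlib.import_module(module)
        print(f"{module}: ok")
    except ImportError as err:
        print(f"{module}: MISSING ({err})")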