Spaces:

michon
/

mrrrme-emotion-ai

Sleeping

App Files Files Community

michon committed 16 days ago

Commit

6b5d3ca

1 Parent(s): 9d3aa21

piper tts

Browse files

Files changed (4) hide show

Dockerfile +31 -19
avatar/speak_server.py +83 -29
mrrrme/audio/voice_assistant.py +106 -45
requirements_docker.txt +3 -3

Dockerfile CHANGED Viewed

@@ -1,9 +1,7 @@
-# Hugging Face Spaces - MrrrMe Emotion AI
-# All-in-one container with PERSISTENT DATABASE SUPPORT
 FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
-# Install system dependencies + Dev Mode requirements
 RUN apt-get update && apt-get install -y \
     bash \
     git \
@@ -29,28 +27,45 @@ RUN apt-get update && apt-get install -y \
     unzip \
     && rm -rf /var/lib/apt/lists/*
-# Install Node.js 20 (required for Next.js 16)
 RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
     apt-get install -y nodejs && \
     rm -rf /var/lib/apt/lists/*
-# Install Rhubarb Lip Sync with ALL resources
 RUN wget https://github.com/DanielSWolf/rhubarb-lip-sync/releases/download/v1.13.0/Rhubarb-Lip-Sync-1.13.0-Linux.zip -O /tmp/rhubarb.zip && \
     unzip /tmp/rhubarb.zip -d /tmp && \
     mkdir -p /opt/rhubarb && \
     cp -r /tmp/Rhubarb-Lip-Sync-1.13.0-Linux/* /opt/rhubarb/ && \
     ln -s /opt/rhubarb/rhubarb /usr/local/bin/rhubarb && \
     chmod +x /opt/rhubarb/rhubarb && \
-    rm -rf /tmp/rhubarb* && \
-    echo "✅ Rhubarb Lip Sync installed with resources"
 # Set Python 3.11 as default
-RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1
-RUN update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
 WORKDIR /app
-# Install PyTorch with CUDA 11.8 support
 RUN python3.11 -m pip install --no-cache-dir \
     torch==2.4.0 \
     torchvision==0.19.0 \
@@ -69,7 +84,7 @@ RUN PIP_CONSTRAINT=/tmp/constraints.txt python3.11 -m pip install --no-cache-dir
 # Install avatar dependencies
 RUN python3.11 -m pip install --no-cache-dir \
-    fastapi uvicorn python-multipart edge-tts pydub websockets
 # Copy application code
 COPY --link --chown=1000:1000 mrrrme/ ./mrrrme/
@@ -80,7 +95,7 @@ COPY --link --chown=1000:1000 weights/ ./weights/
 # Create directories
 RUN mkdir -p /app/weights /app/avatar/static
-# Fix openface bug - single line version
 RUN python3.11 -c "import os; fp='/usr/local/lib/python3.11/dist-packages/openface/multitask_model.py'; c=open(fp).read() if os.path.exists(fp) else ''; exec(\"if os.path.exists(fp) and 'import cv2' not in c:\\n    open(fp,'w').write('import cv2\\\\n'+c)\\n    print('Patched')\")"
 # Build frontend
@@ -91,7 +106,7 @@ RUN npm ci
 COPY --link --chown=1000:1000 avatar-frontend/ ./
 RUN npm run build
-# Copy static files to standalone
 RUN cp -r .next/static .next/standalone/.next/ && \
     cp -r public .next/standalone/ 2>/dev/null || true
@@ -109,16 +124,13 @@ RUN mkdir -p /etc/nginx/certs && \
     -subj "/CN=mrrrme.hf.space"
 # Create startup script
-RUN printf '#!/bin/bash\nset -e\n\n# Configure git for Dev Mode\nexport HOME=/tmp\nmkdir -p /tmp\ngit config --global user.name "michon" 2>/dev/null || true\ngit config --global user.email "[email protected]" 2>/dev/null || true\n\n# Check persistent storage\nif [ -d "/data" ] && [ -w "/data" ]; then\n    echo "📁 Persistent storage: /data (survives rebuilds)"\n    chmod 777 /data 2>/dev/null || true\nelse\n    echo "⚠️  Ephemeral storage: /tmp (lost on rebuild)"\n    echo "Enable persistent storage in Space Settings!"\nfi\n\n# Kill any existing processes\npkill -f "mrrrme.backend_server" 2>/dev/null || true\npkill -f "speak_server.py" 2>/dev/null || true\npkill -f "node server.js" 2>/dev/null || true\npkill -f "nginx" 2>/dev/null || true\nsleep 2\n\necho "Starting MrrrMe..."\ncd /app && python3.11 -m mrrrme.backend_server &\ncd /app/avatar && python3.11 speak_server.py &\ncd /app/frontend/.next/standalone && HOSTNAME=0.0.0.0 PORT=3001 node server.js &\nsleep 10\nnginx -g "daemon off;" &\necho "Ready!"\n\n# Keep container alive\nwait\n' > /app/start.sh && chmod +x /app/start.sh
-# Set ownership of entire /app to user 1000
 RUN chown -R 1000:1000 /app
-# CRITICAL: Switch to user 1000 for Dev Mode
 USER 1000
-# Expose Hugging Face Spaces port
 EXPOSE 7860
-# Start all services
 CMD ["/app/start.sh"]

+# Hugging Face Spaces - MrrrMe Emotion AI with PIPER TTS
 FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
+# Install system dependencies
 RUN apt-get update && apt-get install -y \
     bash \
     git \
     unzip \
     && rm -rf /var/lib/apt/lists/*
+# Install Node.js 20
 RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
     apt-get install -y nodejs && \
     rm -rf /var/lib/apt/lists/*
+# Install Piper TTS (release binary pinned to v1.2.0).
+# The symlink assumes the archive unpacks to /opt/piper/; the tarball is removed
+# in the same layer so it never ends up in the image.
+RUN wget https://github.com/rhasspy/piper/releases/download/v1.2.0/piper_amd64.tar.gz -O /tmp/piper.tar.gz && \
+    tar -xzf /tmp/piper.tar.gz -C /opt && \
+    ln -s /opt/piper/piper /usr/local/bin/piper && \
+    chmod +x /opt/piper/piper && \
+    rm /tmp/piper.tar.gz
+# Download Piper voices (model .onnx + its .onnx.json config).
+# This list must contain every voice named in the VOICE_MAP tables of
+# avatar/speak_server.py and mrrrme/audio/voice_assistant.py; amy-low, joe-medium
+# and nl_BE-rdh were referenced there but previously not downloaded.
+RUN mkdir -p /opt/piper/voices && cd /opt/piper/voices && \
+    base=https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0 && \
+    for voice in \
+        en/en_US/amy/low/en_US-amy-low \
+        en/en_US/amy/medium/en_US-amy-medium \
+        en/en_US/joe/medium/en_US-joe-medium \
+        en/en_US/ryan/high/en_US-ryan-high \
+        nl/nl_BE/rdh/medium/nl_BE-rdh-medium \
+        nl/nl_NL/mls/medium/nl_NL-mls-medium \
+    ; do \
+        wget -q "${base}/${voice}.onnx" "${base}/${voice}.onnx.json" || exit 1; \
+    done
+ENV PIPER_VOICES_DIR=/opt/piper/voices
+# Install Rhubarb Lip Sync
 RUN wget https://github.com/DanielSWolf/rhubarb-lip-sync/releases/download/v1.13.0/Rhubarb-Lip-Sync-1.13.0-Linux.zip -O /tmp/rhubarb.zip && \
     unzip /tmp/rhubarb.zip -d /tmp && \
     mkdir -p /opt/rhubarb && \
     cp -r /tmp/Rhubarb-Lip-Sync-1.13.0-Linux/* /opt/rhubarb/ && \
     ln -s /opt/rhubarb/rhubarb /usr/local/bin/rhubarb && \
     chmod +x /opt/rhubarb/rhubarb && \
+    rm -rf /tmp/rhubarb*
 # Set Python 3.11 as default
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
+    update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
 WORKDIR /app
+# Install PyTorch with CUDA 11.8
 RUN python3.11 -m pip install --no-cache-dir \
     torch==2.4.0 \
     torchvision==0.19.0 \
 # Install avatar dependencies
 RUN python3.11 -m pip install --no-cache-dir \
+    fastapi uvicorn python-multipart pydub websockets piper-tts onnxruntime
 # Copy application code
 COPY --link --chown=1000:1000 mrrrme/ ./mrrrme/
 # Create directories
 RUN mkdir -p /app/weights /app/avatar/static
+# Fix openface bug
 RUN python3.11 -c "import os; fp='/usr/local/lib/python3.11/dist-packages/openface/multitask_model.py'; c=open(fp).read() if os.path.exists(fp) else ''; exec(\"if os.path.exists(fp) and 'import cv2' not in c:\\n    open(fp,'w').write('import cv2\\\\n'+c)\\n    print('Patched')\")"
 # Build frontend
 COPY --link --chown=1000:1000 avatar-frontend/ ./
 RUN npm run build
+# Copy static files
 RUN cp -r .next/static .next/standalone/.next/ && \
     cp -r public .next/standalone/ 2>/dev/null || true
     -subj "/CN=mrrrme.hf.space"
 # Create startup script
+RUN printf '#!/bin/bash\nset -e\nexport HOME=/tmp\nmkdir -p /tmp\ngit config --global user.name "michon" 2>/dev/null || true\ngit config --global user.email "[email protected]" 2>/dev/null || true\nif [ -d "/data" ] && [ -w "/data" ]; then\n    echo "Persistent storage: /data"\n    chmod 777 /data 2>/dev/null || true\nelse\n    echo "Ephemeral storage: /tmp"\nfi\nexport PIPER_VOICES_DIR=/opt/piper/voices\npkill -f "mrrrme.backend_server" 2>/dev/null || true\npkill -f "speak_server.py" 2>/dev/null || true\npkill -f "node server.js" 2>/dev/null || true\npkill -f "nginx" 2>/dev/null || true\nsleep 2\necho "Starting MrrrMe..."\ncd /app && python3.11 -m mrrrme.backend_server &\ncd /app/avatar && python3.11 speak_server.py &\ncd /app/frontend/.next/standalone && HOSTNAME=0.0.0.0 PORT=3001 node server.js &\nsleep 10\nnginx -g "daemon off;" &\necho "Ready!"\nwait\n' > /app/start.sh && chmod +x /app/start.sh
+# Set ownership
 RUN chown -R 1000:1000 /app
 USER 1000
 EXPOSE 7860
 CMD ["/app/start.sh"]

avatar/speak_server.py CHANGED Viewed

@@ -1,10 +1,10 @@
-"""Avatar Backend - EDGE TTS (FREE) + PATTERN-BASED LIP SYNC"""
 import os
 import json
 import uuid
 import time
-import asyncio
-import edge_tts
 from fastapi import FastAPI, Form, WebSocket
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
@@ -18,19 +18,23 @@ load_dotenv()
 OUT_DIR = "/tmp/avatar_static"
 os.makedirs(OUT_DIR, exist_ok=True)
-# Edge TTS Voice Mapping
-# Use `edge-tts --list-voices` to see all options
 VOICE_MAP = {
     "en": {
-        "female": "en-US-AriaNeural",
-        "male": "en-US-GuyNeural"
     },
     "nl": {
-        "female": "nl-NL-FennaNeural",
-        "male": "nl-NL-MaartenNeural"
     }
 }
 app = FastAPI()
 app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 app.mount("/static", StaticFiles(directory=OUT_DIR), name="static")
@@ -78,19 +82,42 @@ def text_to_visemes_simple(text: str, duration: float):
     return visemes
-async def generate_edge_tts(text: str, mp3_path: str, voice_name: str):
-    """Generate audio using Edge TTS"""
     try:
-        print(f"[TTS] 🔧 Generating with Edge TTS ({voice_name})...")
-        communicate = edge_tts.Communicate(text, voice_name)
-        await communicate.save(mp3_path)
-        if os.path.exists(mp3_path) and os.path.getsize(mp3_path) > 0:
             return True
         return False
     except Exception as e:
-        print(f"[TTS] ❌ Edge TTS error: {e}")
-        raise e
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
@@ -118,33 +145,48 @@ async def broadcast_to_avatars(data: dict):
 async def speak(text: str = Form(...), voice: str = Form("female"), language: str = Form("en")):
     t_start = time.time()
     uid = uuid.uuid4().hex[:8]
     mp3_path = os.path.join(OUT_DIR, f"{uid}.mp3")
     # Select voice based on preferences
-    # Default to 'en' and 'female' if invalid keys provided
     lang_map = VOICE_MAP.get(language, VOICE_MAP["en"])
     selected_voice = lang_map.get(voice, lang_map["female"])
     print(f"\n{'='*60}")
-    print(f"[Backend] [{time.strftime('%H:%M:%S')}] EDGE TTS GENERATION")
     print(f"[Backend] Text: '{text}'")
     print(f"[Backend] Params: voice={voice}, lang={language} -> {selected_voice}")
     try:
-        # Step 1: Generate TTS
         t1 = time.time()
-        await generate_edge_tts(text, mp3_path, selected_voice)
         t2 = time.time()
         print(f"[Backend] [+{t2-t_start:.2f}s] Audio generated ({t2-t1:.2f}s)")
-        # Step 2: Get audio duration for lip sync
         try:
-            audio = AudioSegment.from_file(mp3_path)
             duration_sec = len(audio) / 1000.0
-        except Exception as e:
-            print(f"[Backend] ⚠️ Could not read audio duration: {e}")
-            # Fallback estimation: roughly 15 chars per second
-            duration_sec = max(1.5, len(text) / 15.0)
         # Step 3: Generate visemes
         t3 = time.time()
@@ -154,7 +196,7 @@ async def speak(text: str = Form(...), voice: str = Form("female"), language: st
         print(f"[Backend] [+{t4-t_start:.2f}s] Visemes generated: {len(visemes)}")
         response_data = {
-            "audio_url": f"/static/{os.path.basename(mp3_path)}",
             "visemes": visemes,
             "duration": duration_sec,
             "text": text
@@ -171,7 +213,19 @@ async def speak(text: str = Form(...), voice: str = Form("female"), language: st
             content={"error": error_msg, "text": text}
         )
 if __name__ == "__main__":
     import uvicorn
-    print("🚀 Edge TTS Server starting on port 8765...")
     uvicorn.run(app, host="0.0.0.0", port=8765)

+"""Avatar Backend - PIPER TTS (LOCAL & FREE) + PATTERN-BASED LIP SYNC"""
 import os
 import json
 import uuid
 import time
+import subprocess
+import wave
 from fastapi import FastAPI, Form, WebSocket
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 OUT_DIR = "/tmp/avatar_static"
 os.makedirs(OUT_DIR, exist_ok=True)
+# Piper Voice Mapping
+# Download voices from: https://github.com/rhasspy/piper/releases
+# Or use: piper --download-dir ./voices --list-voices
 VOICE_MAP = {
     "en": {
+        "female": "en_US-amy-medium",  # Clear, friendly female voice
+        "male": "en_US-ryan-high"      # Clear, friendly male voice
     },
     "nl": {
+        "female": "nl_NL-mls-medium",  # Dutch female
+        "male": "nl_BE-rdh-medium"     # Belgian Dutch male
     }
 }
+# Piper executable path (adjust if needed)
+PIPER_PATH = "piper"  # Assumes piper is in PATH
 app = FastAPI()
 app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 app.mount("/static", StaticFiles(directory=OUT_DIR), name="static")
     return visemes
+def generate_piper_tts(text: str, wav_path: str, voice_name: str):
+    """Generate audio using Piper TTS"""
     try:
+        print(f"[TTS] 🔧 Generating with Piper TTS ({voice_name})...")
+        # Piper command: echo "text" | piper --model voice_name --output_file output.wav
+        # Or: piper --model voice_name --output_file output.wav < input.txt
+        # Method 1: Using subprocess with pipe
+        process = subprocess.Popen(
+            [PIPER_PATH, "--model", voice_name, "--output_file", wav_path],
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True
+        )
+        stdout, stderr = process.communicate(input=text, timeout=30)
+        if process.returncode != 0:
+            print(f"[TTS] ❌ Piper error: {stderr}")
+            return False
+        if os.path.exists(wav_path) and os.path.getsize(wav_path) > 0:
+            print(f"[TTS] ✅ Generated: {wav_path}")
             return True
+        return False
+    except subprocess.TimeoutExpired:
+        print(f"[TTS] ❌ Piper timeout")
+        process.kill()
         return False
     except Exception as e:
+        print(f"[TTS] ❌ Piper error: {e}")
+        return False
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket):
 async def speak(text: str = Form(...), voice: str = Form("female"), language: str = Form("en")):
     t_start = time.time()
     uid = uuid.uuid4().hex[:8]
+    wav_path = os.path.join(OUT_DIR, f"{uid}.wav")
     mp3_path = os.path.join(OUT_DIR, f"{uid}.mp3")
     # Select voice based on preferences
     lang_map = VOICE_MAP.get(language, VOICE_MAP["en"])
     selected_voice = lang_map.get(voice, lang_map["female"])
     print(f"\n{'='*60}")
+    print(f"[Backend] [{time.strftime('%H:%M:%S')}] PIPER TTS GENERATION")
     print(f"[Backend] Text: '{text}'")
     print(f"[Backend] Params: voice={voice}, lang={language} -> {selected_voice}")
     try:
+        # Step 1: Generate TTS with Piper
         t1 = time.time()
+        success = generate_piper_tts(text, wav_path, selected_voice)
+        if not success:
+            raise Exception("Piper TTS generation failed")
         t2 = time.time()
         print(f"[Backend] [+{t2-t_start:.2f}s] Audio generated ({t2-t1:.2f}s)")
+        # Step 2: Convert WAV to MP3 (optional, for better web compatibility)
         try:
+            audio = AudioSegment.from_wav(wav_path)
+            audio.export(mp3_path, format="mp3", bitrate="128k")
             duration_sec = len(audio) / 1000.0
+            # Clean up WAV file
+            os.remove(wav_path)
+            audio_file = mp3_path
+            print(f"[Backend] ✅ Converted to MP3")
+        except Exception as conv_err:
+            print(f"[Backend] ⚠️ MP3 conversion failed, using WAV: {conv_err}")
+            # Read duration from WAV
+            with wave.open(wav_path, 'rb') as wf:
+                frames = wf.getnframes()
+                rate = wf.getframerate()
+                duration_sec = frames / float(rate)
+            audio_file = wav_path
         # Step 3: Generate visemes
         t3 = time.time()
         print(f"[Backend] [+{t4-t_start:.2f}s] Visemes generated: {len(visemes)}")
         response_data = {
+            "audio_url": f"/static/{os.path.basename(audio_file)}",
             "visemes": visemes,
             "duration": duration_sec,
             "text": text
             content={"error": error_msg, "text": text}
         )
+@app.get("/")
+async def root():
+    """Health check endpoint"""
+    return {
+        "status": "running",
+        "tts_engine": "piper",
+        "voices": VOICE_MAP
+    }
 if __name__ == "__main__":
     import uvicorn
+    print("🚀 Piper TTS Server starting on port 8765...")
+    print("📦 Make sure Piper voices are installed!")
+    print("   Download from: https://github.com/rhasspy/piper/releases")
+    print("   Or use: piper --download-dir ./voices --list-voices")
     uvicorn.run(app, host="0.0.0.0", port=8765)

mrrrme/audio/voice_assistant.py CHANGED Viewed

@@ -1,55 +1,76 @@
-"""Text-to-Speech using Edge TTS (FREE & High Quality) - DOCKER/HF SPACES VERSION"""
 import os
 import time
 import tempfile
 import threading
-import asyncio
 import pygame
-import edge_tts
 from dotenv import load_dotenv
 load_dotenv()
 # ========== Configuration ==========
-# Edge TTS Voices (Free, High Quality)
-# Check `edge-tts --list-voices` for more
 VOICE_MAP = {
-    "female": "en-US-AriaNeural",      # Standard Friendly Female
-    "male": "en-US-GuyNeural",         # Standard Friendly Male
-    "Happy": "en-US-AriaNeural",
-    "Sad": "en-US-AriaNeural",
-    "Angry": "en-US-AriaNeural",
-    "Neutral": "en-US-AriaNeural",
 }
 class VoiceAssistant:
     """
-    Edge TTS (Free Microsoft Azure Voices)
-    High quality, no account required, unlimited free usage.
     """
-    def __init__(self, voice: str = "female", rate: str = "+0%"):
         self.voice_key = voice
-        self.voice_name = VOICE_MAP.get(voice, "en-US-AriaNeural")
-        self.rate = rate
-        self.pitch = "+0Hz"
         self.counter = 0
         self.is_speaking = False
         self.speaking_lock = threading.Lock()
         self.audio_workers = []
         # Init pygame for playback
         print("[TTS] 🔧 Initializing pygame...")
         try:
             pygame.mixer.quit()
-            pygame.mixer.init(frequency=24000, size=-16, channels=1, buffer=2048)
             print(f"[TTS] ✅ Pygame ready")
         except Exception as e:
             print(f"[TTS] ⚠️ Pygame warning: {e}")
-        print(f"[TTS] ✅ Ready with Edge TTS ({self.voice_name})!\n")
     def register_audio_worker(self, worker):
         self.audio_workers.append(worker)
@@ -60,15 +81,28 @@ class VoiceAssistant:
         if voice_key in VOICE_MAP:
             self.voice_name = VOICE_MAP[voice_key]
             self.voice_key = voice_key
-            print(f"[TTS] 🎙️ voice → {self.voice_name}")
-        elif voice_key.startswith("en-") or voice_key.startswith("nl-"):
-            self.voice_name = voice_key # Allow direct setting
-    def set_rate(self, rate: str):
-        self.rate = rate
     def apply_emotion_voice(self, emotion: str, intensity: float = 0.5):
-        pass
     def stop(self):
         print("[TTS] 🛑 STOP")
@@ -83,36 +117,58 @@ class VoiceAssistant:
         for worker in self.audio_workers:
             if hasattr(worker, 'resume_listening'):
-                try: worker.resume_listening()
-                except: pass
-    def _get_unique_filename(self, ext: str = ".mp3"):
         self.counter += 1
         return os.path.join(
             tempfile.gettempdir(),
-            f"edge_{self.counter}_{int(time.time() * 1000)}{ext}"
         )
-    async def _generate_speech_async(self, text: str, filename: str):
-        """Async generation function"""
-        communicate = edge_tts.Communicate(text, self.voice_name, rate=self.rate, pitch=self.pitch)
-        await communicate.save(filename)
     def _generate_speech(self, text: str, filename: str):
         try:
-            print(f"[TTS] 🔧 Generating with Edge TTS...")
             start = time.time()
-            # Run async function in synchronous wrapper
-            asyncio.run(self._generate_speech_async(text, filename))
             gen_time = time.time() - start
-            if os.path.exists(filename) and os.path.getsize(filename) > 0:
                 print(f"[TTS] ✅ Generated in {gen_time:.2f}s")
                 return True
-            return False
         except Exception as e:
             print(f"[TTS] ❌ Error: {e}")
             return False
@@ -146,13 +202,15 @@ class VoiceAssistant:
         # Pause workers
         for worker in self.audio_workers:
             if hasattr(worker, 'pause_listening'):
-                try: worker.pause_listening()
-                except: pass
         with self.speaking_lock:
             self.is_speaking = True
-        temp_file = self._get_unique_filename(".mp3")
         try:
             if self._generate_speech(text, temp_file):
@@ -161,7 +219,8 @@ class VoiceAssistant:
                 try:
                     if os.path.exists(temp_file):
                         os.remove(temp_file)
-                except: pass
         except Exception as e:
             print(f"[TTS] ❌ Error: {e}")
@@ -174,8 +233,10 @@ class VoiceAssistant:
             # Resume workers
             for worker in self.audio_workers:
                 if hasattr(worker, 'resume_listening'):
-                    try: worker.resume_listening()
-                    except: pass
     def speak_async(self, text: str):
         threading.Thread(target=self.speak, args=(text,), daemon=True).start()

+"""Text-to-Speech using Piper TTS (FREE, LOCAL & FAST) - DOCKER/HF SPACES VERSION"""
 import os
 import time
 import tempfile
 import threading
+import subprocess
 import pygame
+from pydub import AudioSegment
 from dotenv import load_dotenv
 load_dotenv()
 # ========== Configuration ==========
+# Piper TTS Voices (Local, High Quality, FAST)
+# Download voices from: https://github.com/rhasspy/piper/releases/tag/v1.2.0
 VOICE_MAP = {
+    "female": "en_US-amy-medium",      # Clear, natural female voice
+    "male": "en_US-ryan-high",         # Clear, natural male voice
+    "Happy": "en_US-amy-medium",
+    "Sad": "en_US-amy-low",
+    "Angry": "en_US-joe-medium",
+    "Neutral": "en_US-amy-medium",
 }
+# Piper executable path
+PIPER_PATH = os.environ.get("PIPER_PATH", "piper")
 class VoiceAssistant:
     """
+    Piper TTS (Local Neural TTS)
+    - 100% offline
+    - Real-time generation
+    - High quality voices
+    - No API keys required
     """
+    def __init__(self, voice: str = "female", rate: float = 1.0):
         self.voice_key = voice
+        self.voice_name = VOICE_MAP.get(voice, "en_US-amy-medium")
+        self.rate = rate  # Speech rate multiplier (0.5-2.0)
         self.counter = 0
         self.is_speaking = False
         self.speaking_lock = threading.Lock()
         self.audio_workers = []
+        # Check if Piper is installed
+        try:
+            result = subprocess.run(
+                [PIPER_PATH, "--version"],
+                capture_output=True,
+                text=True,
+                timeout=5
+            )
+            print(f"[TTS] ✅ Piper version: {result.stdout.strip()}")
+        except FileNotFoundError:
+            print(f"[TTS] ❌ Piper not found! Install with: pip install piper-tts")
+            print(f"[TTS] Or download from: https://github.com/rhasspy/piper/releases")
+            raise
+        except Exception as e:
+            print(f"[TTS] ⚠️ Piper check failed: {e}")
         # Init pygame for playback
         print("[TTS] 🔧 Initializing pygame...")
         try:
             pygame.mixer.quit()
+            pygame.mixer.init(frequency=22050, size=-16, channels=1, buffer=2048)
             print(f"[TTS] ✅ Pygame ready")
         except Exception as e:
             print(f"[TTS] ⚠️ Pygame warning: {e}")
+        print(f"[TTS] ✅ Ready with Piper TTS ({self.voice_name})!\n")
     def register_audio_worker(self, worker):
         self.audio_workers.append(worker)
         if voice_key in VOICE_MAP:
             self.voice_name = VOICE_MAP[voice_key]
             self.voice_key = voice_key
+            print(f"[TTS] 🎙️ Voice → {self.voice_name}")
+        else:
+            # Allow direct voice name setting
+            self.voice_name = voice_key
+    def set_rate(self, rate: float):
+        """Set speech rate (0.5 = slow, 1.0 = normal, 2.0 = fast)"""
+        self.rate = max(0.5, min(2.0, rate))
+        print(f"[TTS] 🎚️ Rate → {self.rate}x")
     def apply_emotion_voice(self, emotion: str, intensity: float = 0.5):
+        """Apply emotion-specific voice settings"""
+        # Piper doesn't have built-in emotion control
+        # But we can adjust rate and potentially voice
+        if emotion == "Happy":
+            self.rate = 1.1  # Slightly faster
+        elif emotion == "Sad":
+            self.rate = 0.9  # Slightly slower
+        elif emotion == "Angry":
+            self.rate = 1.2  # Faster
+        else:  # Neutral
+            self.rate = 1.0
     def stop(self):
         print("[TTS] 🛑 STOP")
         for worker in self.audio_workers:
             if hasattr(worker, 'resume_listening'):
+                try:
+                    worker.resume_listening()
+                except:
+                    pass
+    def _get_unique_filename(self, ext: str = ".wav"):
         self.counter += 1
         return os.path.join(
             tempfile.gettempdir(),
+            f"piper_{self.counter}_{int(time.time() * 1000)}{ext}"
         )
     def _generate_speech(self, text: str, filename: str):
+        """Generate speech using Piper TTS"""
         try:
+            print(f"[TTS] 🔧 Generating with Piper...")
             start = time.time()
+            # Piper command: echo "text" | piper --model voice --output_file output.wav
+            cmd = [
+                PIPER_PATH,
+                "--model", self.voice_name,
+                "--output_file", filename
+            ]
+            # Add length scale (rate) if not 1.0
+            if self.rate != 1.0:
+                cmd.extend(["--length_scale", str(1.0 / self.rate)])
+            process = subprocess.Popen(
+                cmd,
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+            stdout, stderr = process.communicate(input=text, timeout=30)
             gen_time = time.time() - start
+            if process.returncode == 0 and os.path.exists(filename) and os.path.getsize(filename) > 0:
                 print(f"[TTS] ✅ Generated in {gen_time:.2f}s")
                 return True
+            else:
+                print(f"[TTS] ❌ Generation failed: {stderr}")
+                return False
+        except subprocess.TimeoutExpired:
+            print(f"[TTS] ❌ Timeout")
+            process.kill()
+            return False
         except Exception as e:
             print(f"[TTS] ❌ Error: {e}")
             return False
         # Pause workers
         for worker in self.audio_workers:
             if hasattr(worker, 'pause_listening'):
+                try:
+                    worker.pause_listening()
+                except:
+                    pass
         with self.speaking_lock:
             self.is_speaking = True
+        temp_file = self._get_unique_filename(".wav")
         try:
             if self._generate_speech(text, temp_file):
                 try:
                     if os.path.exists(temp_file):
                         os.remove(temp_file)
+                except:
+                    pass
         except Exception as e:
             print(f"[TTS] ❌ Error: {e}")
             # Resume workers
             for worker in self.audio_workers:
                 if hasattr(worker, 'resume_listening'):
+                    try:
+                        worker.resume_listening()
+                    except:
+                        pass
     def speak_async(self, text: str):
         threading.Thread(target=self.speak, args=(text,), daemon=True).start()

requirements_docker.txt CHANGED Viewed

@@ -31,11 +31,11 @@ pyaudio==0.2.14
 pydub==0.25.1
 sounddevice==0.5.1
 webrtcvad==2.0.10
-# openai-whisper removed - using browser Speech Recognition API instead
-# Text-to-Speech
-elevenlabs==1.9.0
 pygame==2.6.1
 # Text processing
 nltk==3.9.1

 pydub==0.25.1
 sounddevice==0.5.1
 webrtcvad==2.0.10
+# Text-to-Speech (PIPER - Local & Fast)
+piper-tts==1.2.0
 pygame==2.6.1
+onnxruntime==1.16.3
 # Text processing
 nltk==3.9.1