Spaces:

michon
/

mrrrme-emotion-ai

Sleeping

App Files Files Community

michon committed 30 days ago

Commit

863b63a

1 Parent(s): 9901f37

restore

Browse files

Files changed (2) hide show

avatar/- +29 -0
avatar/speak_server.py +186 -0

avatar/- ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "metadata": {
+    "soundFile": "C:\\Users\\Michon.DESKTOP-ALISOTL\\Documents\\GitHub\\2025-26ab-fai3-specialisation-project-team-mrrrme\\MrrrMe\\avatar\\static\\91c72247.wav",
+    "duration": 4.41
+  },
+  "mouthCues": [
+    { "start": 0.00, "end": 0.12, "value": "X" },
+    { "start": 0.12, "end": 0.37, "value": "D" },
+    { "start": 0.37, "end": 0.44, "value": "C" },
+    { "start": 0.44, "end": 0.73, "value": "B" },
+    { "start": 0.73, "end": 0.76, "value": "C" },
+    { "start": 0.76, "end": 0.84, "value": "A" },
+    { "start": 0.84, "end": 0.90, "value": "B" },
+    { "start": 0.90, "end": 0.96, "value": "C" },
+    { "start": 0.96, "end": 1.03, "value": "D" },
+    { "start": 1.03, "end": 1.27, "value": "B" },
+    { "start": 1.27, "end": 1.32, "value": "D" },
+    { "start": 1.32, "end": 1.37, "value": "B" },
+    { "start": 1.37, "end": 1.45, "value": "A" },
+    { "start": 1.45, "end": 1.66, "value": "B" },
+    { "start": 1.66, "end": 2.56, "value": "X" },
+    { "start": 2.56, "end": 2.74, "value": "C" },
+    { "start": 2.74, "end": 2.81, "value": "B" },
+    { "start": 2.81, "end": 3.02, "value": "F" },
+    { "start": 3.02, "end": 3.30, "value": "B" },
+    { "start": 3.30, "end": 3.58, "value": "F" },
+    { "start": 3.58, "end": 4.41, "value": "X" }
+  ]
+}

avatar/speak_server.py ADDED Viewed

	@@ -0,0 +1,186 @@

+"""Avatar Backend - ELEVENLABS TTS + PATTERN-BASED LIP SYNC"""
+import os, json, uuid, time, re, asyncio
+from fastapi import FastAPI, Form, WebSocket
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+from fastapi.responses import JSONResponse
+from pydub import AudioSegment
+from elevenlabs.client import ElevenLabs
+from typing import List
+from pathlib import Path
+from dotenv import load_dotenv
+load_dotenv()
+OUT_DIR = "/tmp/avatar_static"
+os.makedirs(OUT_DIR, exist_ok=True)
+# ElevenLabs configuration
+ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
+if not ELEVENLABS_API_KEY:
+    print("[TTS] ⚠️ ELEVENLABS_API_KEY not set - TTS will fail!")
+    client = None
+else:
+    client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
+    print("[TTS] ✅ ElevenLabs client initialized")
+VOICE = os.getenv("ELEVENLABS_VOICE", "pFZP5JQG7iQjIQuC4Bku")
+ELEVENLABS_MODEL = "eleven_multilingual_v2"
+app = FastAPI()
+app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
+app.mount("/static", StaticFiles(directory=OUT_DIR), name="static")
+active_connections: List[WebSocket] = []
+def text_to_visemes_simple(text: str, duration: float):
+    """
+    Generate visemes from text patterns - SUPER FAST!
+    Good quality, instant generation.
+    """
+    visemes = []
+    words = text.split()
+    time_per_word = duration / max(len(words), 1)
+    current_time = 0.0
+    for word in words:
+        word_lower = word.lower().strip('.,!?')
+        for i, char in enumerate(word_lower):
+            char_time = current_time + (i / len(word_lower)) * time_per_word
+            if char in 'aá':
+                visemes.append({"t": round(char_time, 3), "blend": {"jawOpen": 0.6}})
+            elif char in 'eé':
+                visemes.append({"t": round(char_time, 3), "blend": {"mouthSmile": 0.4, "jawOpen": 0.2}})
+            elif char in 'ií':
+                visemes.append({"t": round(char_time, 3), "blend": {"mouthSmile": 0.5, "jawOpen": 0.1}})
+            elif char in 'oó':
+                visemes.append({"t": round(char_time, 3), "blend": {"mouthFunnel": 0.6, "jawOpen": 0.3}})
+            elif char in 'uú':
+                visemes.append({"t": round(char_time, 3), "blend": {"mouthPucker": 0.5, "jawOpen": 0.1}})
+            elif char in 'fv':
+                visemes.append({"t": round(char_time, 3), "blend": {"mouthPressLeft": 0.5, "mouthPressRight": 0.5}})
+            elif char in 'mpb':
+                visemes.append({"t": round(char_time, 3), "blend": {"mouthPucker": 0.4}})
+            elif char in 'w':
+                visemes.append({"t": round(char_time, 3), "blend": {"mouthPucker": 0.5, "jawOpen": 0.2}})
+        current_time += time_per_word
+    return visemes
+async def tts_to_mp3(text: str, mp3_path: str):
+    """Convert text to speech using ElevenLabs"""
+    if not client:
+        raise Exception("ElevenLabs API key not configured")
+    try:
+        print(f"[TTS] 🔧 Generating with ElevenLabs (voice: {VOICE})...")
+        start = time.time()
+        loop = asyncio.get_event_loop()
+        def generate_audio():
+            audio_generator = client.generate(
+                text=text,
+                voice=VOICE,
+                model=ELEVENLABS_MODEL
+            )
+            audio_bytes = b"".join(audio_generator)
+            with open(mp3_path, "wb") as f:
+                f.write(audio_bytes)
+        await loop.run_in_executor(None, generate_audio)
+        gen_time = time.time() - start
+        if Path(mp3_path).exists() and Path(mp3_path).stat().st_size > 1000:
+            print(f"[TTS] ✅ Generated in {gen_time:.2f}s")
+            return
+        else:
+            raise Exception("Generated file too small or missing")
+    except Exception as e:
+        print(f"[TTS] ❌ ElevenLabs error: {e}")
+        raise Exception(f"TTS failed: {e}")
+@app.websocket("/ws")
+async def websocket_endpoint(websocket: WebSocket):
+    await websocket.accept()
+    active_connections.append(websocket)
+    print(f"[WebSocket] ✅ Client connected. Total: {len(active_connections)}")
+    try:
+        while True:
+            await websocket.receive_text()
+    except:
+        pass
+    finally:
+        active_connections.remove(websocket)
+async def broadcast_to_avatars(data: dict):
+    for connection in active_connections[:]:
+        try:
+            await connection.send_json(data)
+        except:
+            active_connections.remove(connection)
+@app.post("/speak")
+async def speak(text: str = Form(...)):
+    t_start = time.time()
+    uid = uuid.uuid4().hex[:8]
+    mp3_path = os.path.join(OUT_DIR, f"{uid}.mp3")
+    print(f"\n{'='*60}")
+    print(f"[Backend] [{time.strftime('%H:%M:%S')}] ELEVENLABS TTS")
+    print(f"[Backend] Text: '{text}'")
+    try:
+        # Step 1: Generate TTS with ElevenLabs
+        t1 = time.time()
+        await tts_to_mp3(text, mp3_path)
+        t2 = time.time()
+        print(f"[Backend] [+{t2-t_start:.2f}s] TTS done ({t2-t1:.2f}s)")
+        # Step 2: Get audio duration
+        try:
+            audio = AudioSegment.from_file(mp3_path)
+            duration_sec = len(audio) / 1000.0
+        except Exception as e:
+            print(f"[Backend] ⚠️ Could not read audio file: {e}")
+            duration_sec = len(text) * 0.06
+        # Step 3: Generate visemes (pattern-based - instant!)
+        t3 = time.time()
+        visemes = text_to_visemes_simple(text, duration_sec)
+        t4 = time.time()
+        print(f"[Backend] [+{t4-t_start:.2f}s] Pattern visemes: {len(visemes)} ({t4-t3:.3f}s)")
+        t_end = time.time()
+        print(f"[Backend] ✅ TOTAL: {t_end-t_start:.2f}s")
+        print(f"[Backend] Breakdown: TTS={t2-t1:.2f}s, Visemes={t4-t3:.3f}s")
+        print(f"{'='*60}\n")
+        response_data = {
+            "audio_url": f"/static/{os.path.basename(mp3_path)}",
+            "visemes": visemes,
+            "duration": duration_sec,
+            "text": text
+        }
+        await broadcast_to_avatars(response_data)
+        return response_data
+    except Exception as e:
+        error_msg = f"Failed to generate speech: {str(e)}"
+        print(f"[Backend] ❌ ERROR: {error_msg}")
+        print(f"{'='*60}\n")
+        return JSONResponse(
+            status_code=500,
+            content={"error": error_msg, "text": text}
+        )
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8765)