michon committed on
Commit
863b63a
·
1 Parent(s): 9901f37
Files changed (2) hide show
  1. avatar/- +29 -0
  2. avatar/speak_server.py +186 -0
avatar/- ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "soundFile": "C:\\Users\\Michon.DESKTOP-ALISOTL\\Documents\\GitHub\\2025-26ab-fai3-specialisation-project-team-mrrrme\\MrrrMe\\avatar\\static\\91c72247.wav",
4
+ "duration": 4.41
5
+ },
6
+ "mouthCues": [
7
+ { "start": 0.00, "end": 0.12, "value": "X" },
8
+ { "start": 0.12, "end": 0.37, "value": "D" },
9
+ { "start": 0.37, "end": 0.44, "value": "C" },
10
+ { "start": 0.44, "end": 0.73, "value": "B" },
11
+ { "start": 0.73, "end": 0.76, "value": "C" },
12
+ { "start": 0.76, "end": 0.84, "value": "A" },
13
+ { "start": 0.84, "end": 0.90, "value": "B" },
14
+ { "start": 0.90, "end": 0.96, "value": "C" },
15
+ { "start": 0.96, "end": 1.03, "value": "D" },
16
+ { "start": 1.03, "end": 1.27, "value": "B" },
17
+ { "start": 1.27, "end": 1.32, "value": "D" },
18
+ { "start": 1.32, "end": 1.37, "value": "B" },
19
+ { "start": 1.37, "end": 1.45, "value": "A" },
20
+ { "start": 1.45, "end": 1.66, "value": "B" },
21
+ { "start": 1.66, "end": 2.56, "value": "X" },
22
+ { "start": 2.56, "end": 2.74, "value": "C" },
23
+ { "start": 2.74, "end": 2.81, "value": "B" },
24
+ { "start": 2.81, "end": 3.02, "value": "F" },
25
+ { "start": 3.02, "end": 3.30, "value": "B" },
26
+ { "start": 3.30, "end": 3.58, "value": "F" },
27
+ { "start": 3.58, "end": 4.41, "value": "X" }
28
+ ]
29
+ }
avatar/speak_server.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Avatar Backend - ELEVENLABS TTS + PATTERN-BASED LIP SYNC"""
2
+ import os, json, uuid, time, re, asyncio
3
+ from fastapi import FastAPI, Form, WebSocket
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ from fastapi.staticfiles import StaticFiles
6
+ from fastapi.responses import JSONResponse
7
+ from pydub import AudioSegment
8
+ from elevenlabs.client import ElevenLabs
9
+ from typing import List
10
+ from pathlib import Path
11
+ from dotenv import load_dotenv
12
+
13
# Load settings from a local .env file into the process environment.
load_dotenv()

# Generated audio files are written here and served under /static.
OUT_DIR = "/tmp/avatar_static"
os.makedirs(OUT_DIR, exist_ok=True)

# ElevenLabs configuration: without an API key the server still starts,
# but `client` is None and every TTS request will fail.
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
if ELEVENLABS_API_KEY:
    client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
    print("[TTS] ✅ ElevenLabs client initialized")
else:
    print("[TTS] ⚠️ ELEVENLABS_API_KEY not set - TTS will fail!")
    client = None

# Voice id may be overridden through the environment; the model is fixed.
VOICE = os.environ.get("ELEVENLABS_VOICE", "pFZP5JQG7iQjIQuC4Bku")
ELEVENLABS_MODEL = "eleven_multilingual_v2"

app = FastAPI()
# CORS is fully open: any origin, method and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
app.mount("/static", StaticFiles(directory=OUT_DIR), name="static")

# WebSocket clients currently registered to receive broadcasts.
active_connections: List[WebSocket] = []
35
+
36
# Mouth shape (blend-shape weights) emitted for each recognised base letter.
# Letters that are absent from this table produce no keyframe.
_VISEME_BLENDS = {
    "a": {"jawOpen": 0.6},
    "e": {"mouthSmile": 0.4, "jawOpen": 0.2},
    "i": {"mouthSmile": 0.5, "jawOpen": 0.1},
    "o": {"mouthFunnel": 0.6, "jawOpen": 0.3},
    "u": {"mouthPucker": 0.5, "jawOpen": 0.1},
    "f": {"mouthPressLeft": 0.5, "mouthPressRight": 0.5},
    "v": {"mouthPressLeft": 0.5, "mouthPressRight": 0.5},
    "m": {"mouthPucker": 0.4},
    "p": {"mouthPucker": 0.4},
    "b": {"mouthPucker": 0.4},
    "w": {"mouthPucker": 0.5, "jawOpen": 0.2},
}


def _fold_diacritics(word: str) -> str:
    """Return ``word`` with accents removed (``"à"`` and ``"á"`` become ``"a"``)."""
    # Function-scope import keeps this block self-contained.
    import unicodedata

    decomposed = unicodedata.normalize("NFD", word)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


def text_to_visemes_simple(text: str, duration: float):
    """Approximate lip-sync keyframes from the spelling of ``text``.

    ``duration`` is split evenly between the whitespace-separated words, and
    each word's share is split evenly between its characters (after
    lower-casing, stripping leading/trailing ``.,!?`` and folding
    diacritics). A character with an entry in ``_VISEME_BLENDS`` emits one
    keyframe at its time offset; any other character emits nothing but still
    occupies its time slot.

    Args:
        text: The sentence being spoken.
        duration: Length of the audio clip in seconds.

    Returns:
        A list of ``{"t": seconds, "blend": {shape_name: weight}}`` dicts in
        chronological order; empty when ``text`` contains no words.
    """
    words = text.split()
    if not words:
        return []

    visemes = []
    time_per_word = duration / len(words)
    current_time = 0.0

    for word in words:
        # Folding accents lets every accented vowel (not only the acute ones)
        # map to its base mouth shape, and keeps combining marks in
        # decomposed input from being counted as letters.
        letters = _fold_diacritics(word.lower().strip('.,!?'))

        for i, char in enumerate(letters):
            blend = _VISEME_BLENDS.get(char)
            if blend is None:
                continue
            char_time = current_time + (i / len(letters)) * time_per_word
            # Copy the blend so callers can mutate a keyframe without
            # corrupting the shared table.
            visemes.append({"t": round(char_time, 3), "blend": dict(blend)})

        current_time += time_per_word

    return visemes
72
+
73
async def tts_to_mp3(text: str, mp3_path: str):
    """Synthesize ``text`` with ElevenLabs and write the audio to ``mp3_path``.

    The blocking SDK call and the file write run in the default executor so
    the event loop is not blocked while audio is generated.

    Args:
        text: Text to convert to speech.
        mp3_path: Destination path for the generated audio bytes.

    Raises:
        Exception: If no ElevenLabs client is configured, if generation
            fails, or if the written file is missing or not larger than
            1000 bytes. Generation failures are re-raised as
            ``"TTS failed: ..."`` with the original error chained.
    """
    if not client:
        raise Exception("ElevenLabs API key not configured")

    try:
        print(f"[TTS] 🔧 Generating with ElevenLabs (voice: {VOICE})...")
        start = time.time()

        # Inside a coroutine the running loop is the one to use;
        # get_event_loop() is the legacy spelling.
        loop = asyncio.get_running_loop()

        def generate_audio():
            # The SDK yields audio chunks; collect them before writing.
            audio_generator = client.generate(
                text=text,
                voice=VOICE,
                model=ELEVENLABS_MODEL,
            )
            audio_bytes = b"".join(audio_generator)
            with open(mp3_path, "wb") as f:
                f.write(audio_bytes)

        await loop.run_in_executor(None, generate_audio)

        gen_time = time.time() - start

        # Size check: a file of 1000 bytes or less is treated as a failed
        # generation.
        output = Path(mp3_path)
        if not (output.exists() and output.stat().st_size > 1000):
            raise Exception("Generated file too small or missing")

        print(f"[TTS] ✅ Generated in {gen_time:.2f}s")

    except Exception as e:
        print(f"[TTS] ❌ ElevenLabs error: {e}")
        # Chain explicitly so the original traceback stays attached.
        raise Exception(f"TTS failed: {e}") from e
107
+
108
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """Register a WebSocket client and keep it open until it disconnects.

    Incoming text frames are read and discarded; the connection is kept in
    ``active_connections`` so that broadcasts can reach it.

    Args:
        websocket: The incoming WebSocket connection.
    """
    # Imported here because the file-level fastapi import does not include it.
    from fastapi import WebSocketDisconnect

    await websocket.accept()
    active_connections.append(websocket)
    print(f"[WebSocket] ✅ Client connected. Total: {len(active_connections)}")
    try:
        while True:
            await websocket.receive_text()
    except WebSocketDisconnect:
        # Normal client disconnect; nothing to report.
        pass
    except Exception as e:
        # Still best-effort, but no longer silent. Task cancellation is not
        # an Exception subclass and now propagates instead of being eaten.
        print(f"[WebSocket] Connection closed with error: {e}")
    finally:
        # A failed broadcast may already have dropped this socket; an
        # unconditional remove() would raise ValueError in that case.
        if websocket in active_connections:
            active_connections.remove(websocket)
120
+
121
async def broadcast_to_avatars(data: dict):
    """Send ``data`` as JSON to every registered WebSocket client.

    A client whose send fails is removed from ``active_connections``; the
    remaining clients are still served.

    Args:
        data: JSON-serialisable payload to push to each client.
    """
    # Iterate over a copy: the list is mutated below and may also change
    # while a send is awaited.
    for connection in active_connections[:]:
        try:
            await connection.send_json(data)
        except Exception as e:
            print(f"[WebSocket] Dropping unreachable client: {e}")
            # The endpoint may have unregistered the socket during the
            # awaited send, so check before removing.
            if connection in active_connections:
                active_connections.remove(connection)
127
+
128
@app.post("/speak")
async def speak(text: str = Form(...)):
    """Generate speech and visemes for ``text`` and push them to all avatars.

    Steps: synthesize audio into a uniquely named file under ``OUT_DIR``,
    measure its duration, derive pattern-based visemes, broadcast the result
    over the open WebSockets and return it.

    Args:
        text: Form field holding the sentence to speak.

    Returns:
        On success, a dict with ``audio_url``, ``visemes``, ``duration`` and
        ``text``. On failure, a 500 ``JSONResponse`` with ``error`` and
        ``text``.
    """
    t_start = time.time()
    uid = uuid.uuid4().hex[:8]
    mp3_path = os.path.join(OUT_DIR, f"{uid}.mp3")

    print(f"\n{'='*60}")
    print(f"[Backend] [{time.strftime('%H:%M:%S')}] ELEVENLABS TTS")
    print(f"[Backend] Text: '{text}'")

    try:
        # Step 1: Generate TTS with ElevenLabs
        t1 = time.time()
        await tts_to_mp3(text, mp3_path)
        t2 = time.time()
        print(f"[Backend] [+{t2-t_start:.2f}s] TTS done ({t2-t1:.2f}s)")

        # Step 2: Get audio duration. Decoding is blocking work, so it runs
        # in the default executor to keep the event loop (and the WebSocket
        # traffic on it) responsive.
        try:
            loop = asyncio.get_running_loop()
            audio = await loop.run_in_executor(
                None, AudioSegment.from_file, mp3_path
            )
            duration_sec = len(audio) / 1000.0
        except Exception as e:
            print(f"[Backend] ⚠️ Could not read audio file: {e}")
            # Fallback estimate: 0.06 seconds per character of text.
            duration_sec = len(text) * 0.06

        # Step 3: Generate visemes (pattern-based - instant!)
        t3 = time.time()
        visemes = text_to_visemes_simple(text, duration_sec)
        t4 = time.time()

        print(f"[Backend] [+{t4-t_start:.2f}s] Pattern visemes: {len(visemes)} ({t4-t3:.3f}s)")

        t_end = time.time()
        print(f"[Backend] ✅ TOTAL: {t_end-t_start:.2f}s")
        print(f"[Backend] Breakdown: TTS={t2-t1:.2f}s, Visemes={t4-t3:.3f}s")
        print(f"{'='*60}\n")

        response_data = {
            "audio_url": f"/static/{os.path.basename(mp3_path)}",
            "visemes": visemes,
            "duration": duration_sec,
            "text": text,
        }

        await broadcast_to_avatars(response_data)
        return response_data

    except Exception as e:
        error_msg = f"Failed to generate speech: {str(e)}"
        print(f"[Backend] ❌ ERROR: {error_msg}")
        print(f"{'='*60}\n")

        # The error response never references the file, so do not leave a
        # partial or unusable one behind in the static directory.
        try:
            os.remove(mp3_path)
        except OSError:
            pass  # nothing was written, or it is already gone

        return JSONResponse(
            status_code=500,
            content={"error": error_msg, "text": text},
        )
183
+
184
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 8765.
    import uvicorn

    uvicorn.run(app, port=8765, host="0.0.0.0")