KevanSoon commited on
Commit
f147852
·
1 Parent(s): 89ca815

first project init

Browse files
app.py ADDED
@@ -0,0 +1,610 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ import asyncio
4
+ import re
5
+ import os
6
+ import html
7
+ import requests
8
+ import httpx
9
+ import uuid
10
+ from fastapi import FastAPI, File, Form, UploadFile, HTTPException, Request, Header
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from fastapi.responses import HTMLResponse
13
+ from fastapi import Depends
14
+ from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
15
+ from pydantic import BaseModel
16
+ from requests.exceptions import RequestException
17
+ from dotenv import load_dotenv
18
+ import google.generativeai as genai
19
+ from google.api_core import exceptions as google_exceptions
20
+ from pydantic import BaseModel
21
+ from auth.clerk import verify_clerk_jwt
22
+ from tools.tools import extract_text_from_html, generate_document_insights, analyze_keywords_with_web_search
23
+
24
+
25
# Bearer-token extractor used by protected endpoints (see /api/documents).
security = HTTPBearer()
# Load environment variables from a .env file
load_dotenv()

# Supabase project URL and service-role key; the service-role key bypasses
# row-level security, so it must never be exposed to clients.
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")

app = FastAPI(
    title="Document Translator (Final Architecture)",
    description="Pipeline: Nemo (JSON) -> Sea-Lion (Translate JSON) -> Gemini (HTML)",
    version="10.0.1",  # Final Architecture, patched
)

# Allow requests from the default React frontend port
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3000"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
46
+
47
+
48
def wrap_words_with_spans(html: str) -> str:
    """Wrap every word inside <p>, <h1>, <h2> and <td> elements in a
    clickable <span> carrying a unique, document-wide sequential id.

    Each word becomes
    ``<span data-clickable="true" data-id="word-N">word</span>`` so the
    injected dropdown script (see inject_dropdown_script) can attach
    click handlers.

    Bug fix vs. the previous version: the word regex is now applied only
    to text segments *outside* markup, so nested tags inside a target
    element (e.g. ``<p><b>word</b></p>``) no longer have their tag names
    and attributes wrapped in spans.
    """
    counter = 0
    # Matches a word with optional trailing punctuation.
    word_re = r'\b\w+[.,?!]?\b'

    def wrap_word(match):
        nonlocal counter
        counter += 1
        return f'<span data-clickable="true" data-id="word-{counter}">{match.group(0)}</span>'

    def wrap_inner(m):
        open_tag, inner_text, close_tag = m.groups()
        # Split the inner HTML into tag and non-tag segments and only
        # substitute inside the plain-text segments.
        parts = re.split(r'(<[^>]+>)', inner_text)
        wrapped = ''.join(
            part if part.startswith('<') else re.sub(word_re, wrap_word, part)
            for part in parts
        )
        return open_tag + wrapped + close_tag

    for tag in ('p', 'h1', 'h2', 'td'):
        # Capture content inside these tags (non-greedy, across newlines).
        regex = re.compile(fr'(<{tag}[^>]*>)(.*?)(</{tag}>)', re.DOTALL)
        html = regex.sub(wrap_inner, html)

    return html
68
+
69
def inject_dropdown_script(html: str) -> str:
    """Attach the client-side translation UI to a rendered HTML document.

    Inserts a <script> block that makes every ``span[data-clickable="true"]``
    (produced by wrap_words_with_spans) clickable: a click opens a language
    dropdown positioned under the word, and the chosen translation is fetched
    from the /api/translate_frontend endpoint and swapped into the span.

    The script is placed immediately before ``</body>`` when that tag is
    present; otherwise it is appended to the end of the document.
    """
    script = """
<script>
window.addEventListener('DOMContentLoaded', () => {

function createDropdown(x, y, wordEl, word) {
// Remove any existing dropdown
const oldDropdown = document.getElementById('translation-dropdown');
if (oldDropdown) oldDropdown.remove();

// Create dropdown select element
const dropdown = document.createElement('select');
dropdown.id = 'translation-dropdown';
dropdown.style.position = 'absolute';
dropdown.style.left = x + 'px';
dropdown.style.top = y + 'px';
dropdown.style.zIndex = 9999;

// Languages options
const languages = ['English', 'Chinese', 'Tamil', 'Hindi'];
languages.forEach(lang => {
const option = document.createElement('option');
option.value = lang.toLowerCase();
option.innerText = lang;
dropdown.appendChild(option);
});

// Placeholder option
const defaultOption = document.createElement('option');
defaultOption.value = '';
defaultOption.innerText = 'Select language';
defaultOption.selected = true;
defaultOption.disabled = true;
dropdown.insertBefore(defaultOption, dropdown.firstChild);

document.body.appendChild(dropdown);
dropdown.focus();

dropdown.addEventListener('change', () => {
const selectedLang = dropdown.value;
if (!selectedLang) return;

// Call backend to translate word
fetch('http://localhost:8080/api/translate_frontend', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ text: word, target_language: selectedLang }),
})
.then(res => {
if (!res.ok) throw new Error('Translation API error');
return res.json();
})
.then(data => {
const translated = data.translated_text || word;
wordEl.innerText = translated;

// Add or update language label
let label = wordEl.nextSibling;
if (!label || !label.classList || !label.classList.contains('language-label')) {
label = document.createElement('span');
label.className = 'language-label';
label.style.marginLeft = '6px';
label.style.fontSize = '0.8em';
label.style.color = '#555';
wordEl.after(label);
}
label.textContent = `(${dropdown.options[dropdown.selectedIndex].text})`;
})
.catch(err => {
console.error('Translation error:', err);
alert('Translation failed, please try again.');
});

dropdown.remove();
});

// Clicking outside closes dropdown
document.addEventListener('click', function onDocClick(e) {
if (!dropdown.contains(e.target)) {
dropdown.remove();
document.removeEventListener('click', onDocClick);
}
});
}

// Add click handlers to all words wrapped in spans with data-clickable="true"
document.querySelectorAll('span[data-clickable="true"]').forEach(el => {
el.style.cursor = 'pointer';
el.addEventListener('click', event => {
event.stopPropagation();
const word = el.innerText;
const rect = el.getBoundingClientRect();
const x = rect.left + window.scrollX;
const y = rect.bottom + window.scrollY;
createDropdown(x, y, el, word);
});
});

});
</script>
"""
    # Prefer injecting just before the closing body tag; fall back to a
    # plain append for fragments without one.
    if "</body>" not in html:
        return html + script
    return html.replace("</body>", script + "\n</body>")
174
+
175
# Define a Pydantic model to enforce the structure of the incoming request body.
# NOTE(review): no endpoint in this module currently uses this model
# (/api/analyze_html takes a file upload) — confirm whether it is still needed.
class HtmlAnalysisRequest(BaseModel):
    # Raw HTML document to analyze.
    html: str
178
+
179
@app.post("/api/analyze_html")
async def analyze_html_file(file: UploadFile = File(...)):
    """
    Receives an uploaded HTML file, extracts its text content, and uses the
    Gemini tool to generate a summary and key informational points.

    Raises:
        HTTPException 400: the upload is not an HTML file.
        HTTPException 500: the analysis tool reported an error or an
            unexpected failure occurred.
    """
    # Check if the uploaded file is an HTML file
    if file.content_type != "text/html":
        raise HTTPException(status_code=400, detail="Unsupported file type. Please upload a .html file.")

    try:
        # Step 1: Read the content of the uploaded file
        html_content_bytes = await file.read()
        html_content = html_content_bytes.decode('utf-8')

        # Step 2: Extract text from the HTML using our tool
        document_text = extract_text_from_html(html_content)

        # Step 3: Get insights from the Gemini tool
        analysis_results = await generate_document_insights(document_text)

        # Surface a functional error reported by the tool as a 500
        if 'error' in analysis_results:
            raise HTTPException(status_code=500, detail=analysis_results['error'])

        return analysis_results

    except HTTPException:
        # Bug fix: re-raise intentional HTTP errors unchanged instead of
        # letting the generic handler below rewrap them.
        raise
    except Exception as e:
        # Catch any other unexpected errors
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {e}")
209
+
210
+
211
@app.post("/api/translate_frontend")
async def translate_text(request: Request):
    """Translate a single snippet for the in-page dropdown UI.

    Expects a JSON body ``{"text": ..., "target_language": ...}`` and returns
    ``{"translated_text": ...}`` produced by the Sea-Lion chat-completions API.

    Raises:
        HTTPException 400: missing fields in the request body.
        HTTPException 500: missing API key or an empty model response.
        HTTPException 502: the Sea-Lion API call failed.
    """
    try:
        data = await request.json()
        text = data.get("text")
        target_language = data.get("target_language")

        if not text or not target_language:
            raise HTTPException(status_code=400, detail="Missing 'text' or 'target_language' in request body")

        url = "https://api.sea-lion.ai/v1/chat/completions"
        api_key = os.getenv("SEALION_API_KEY")
        # Fail early with a clear message instead of sending "Bearer None".
        if not api_key:
            raise HTTPException(status_code=500, detail="SEALION_API_KEY is not configured")

        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

        prompt = (
            f"Please translate the following text to {target_language} and return "
            "ONLY the translated text without any explanations or extra formatting:\n\n"
            f"\"{text}\""
        )

        payload = {
            "max_completion_tokens": 1024,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "model": "aisingapore/Gemma-SEA-LION-v3-9B-IT"
        }

        # NOTE(review): requests is blocking; inside an async endpoint this
        # stalls the event loop. Consider httpx.AsyncClient (already a
        # dependency) in a follow-up.
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()

        # Extract translated text from the chat-completions response
        response_json = response.json()
        translated_text = response_json["choices"][0]["message"]["content"].strip()

        if not translated_text:
            raise HTTPException(status_code=500, detail="Empty response from translation model.")

        return {"translated_text": translated_text}

    except HTTPException:
        # Bug fix: keep the intended status codes (400/500) instead of
        # rewrapping them as generic 500s below.
        raise
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=502, detail=f"Translation API request failed: {e}")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
265
+
266
+
267
# --- Model 2: Sea-Lion (The JSON Translator) ---
@app.post("/api/translate")
async def translate_text(text: str, target_language: str):
    """
    Receives text and a target language, and returns the translated text
    using the SEA-LION model.

    NOTE(review): this definition shares its name with the
    /api/translate_frontend handler above. Both routes stay registered
    (the decorator runs before the rebinding), but module-level calls to
    ``translate_text`` — e.g. from the /api/translate_file pipeline —
    resolve to THIS function. Consider renaming one of them.
    """
    # The API endpoint URL for translation
    url = "https://api.sea-lion.ai/v1/chat/completions"

    # API keys are kept in environment variables (loaded via dotenv)
    api_key = os.getenv("SEALION_API_KEY")

    # The headers for the request
    headers = {
        "accept": "text/plain",
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # Bug fix: the original prompt had the placeholders swapped
    # ("Translate the following text to {text}: \"{target_language}\""),
    # asking the model to translate the language name into the document.
    prompt = f"Translate the following text to {target_language}: \"{text}\""

    # The JSON data payload for the request
    data = {
        "max_completion_tokens": 4096,  # Increased token limit for longer translations
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "model": "aisingapore/Llama-SEA-LION-v3-70B-IT"
    }

    try:
        # Make the POST request to the SEA-LION API
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

        # The raw response body is forwarded as-is. NOTE(review): the
        # downstream Gemini prompt expects the full chat-completions JSON
        # shape inside "translated_text" — confirm against the Sea-Lion API
        # before parsing this into choices[0] like the frontend endpoint does.
        translated_text = response.text

        # Guard against an empty body from the model
        if not translated_text:
            raise HTTPException(status_code=500, detail="Received an empty response from the translation model.")

        return {"translated_text": translated_text}

    except HTTPException:
        # Bug fix: preserve the explicit 500 above instead of rewrapping it.
        raise
    except requests.exceptions.RequestException as e:
        # Handle network-related errors
        raise HTTPException(status_code=502, detail=f"Failed to communicate with the translation AI model: {e}")
    except Exception as e:
        # Handle other potential errors
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred during translation: {e}")
323
+
324
+
325
# --- Model 3: Gemini (The HTML Generator) ---
async def generate_html_from_translated_json(translated_json: dict) -> str:
    """
    Receives a translated JSON object and uses Gemini to generate the final
    structured HTML document, with every word wrapped in a clickable span
    and the translation-dropdown script injected.

    Returns:
        The final HTML string. On failure a small self-contained error page
        is returned instead of raising, so callers can always render the
        result directly.
    """
    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name='gemini-2.0-flash')
        json_string_for_prompt = json.dumps(translated_json, indent=2)

        prompt = f"""
You are an expert system that converts a JSON object containing PRE-TRANSLATED text into a clean, semantic HTML document.

**Your Task:**
1. Analyze the following JSON object. Its text content has already been translated.
2. The core document data is located at the path: `choices[0]['message']['tool_calls'][0]['function']['arguments']`.
3. The value of 'arguments' is a JSON STRING. You must parse this inner string to access the list of document chunks.
4. Using the translated data from the 'text' fields, generate a single, complete HTML5 document. Use appropriate tags like <h1>, <h2>, <p>, and <table>.
5. if json contains "tabular" means make a table for that with some grey border and styling
6. Your final output must ONLY be the raw HTML code. Do not add comments or markdown.

**Translated JSON object to process:**
```json
{json_string_for_prompt}
```
"""

        def do_request():
            # generate_content is blocking, so this whole helper runs in a
            # worker thread via asyncio.to_thread below.
            response = model.generate_content(prompt)

            # Extract raw HTML from a Gemini markdown code block, if present
            match = re.search(r'```html\n(.*?)\n```', response.text, re.DOTALL)
            raw_html = match.group(1).strip() if match else response.text.strip()

            # Wrap each word in clickable spans
            wrapped_html = wrap_words_with_spans(raw_html)

            # Inject the dropdown script and return the final document
            return inject_dropdown_script(wrapped_html)

        return await asyncio.to_thread(do_request)
    except google_exceptions.ResourceExhausted:
        error_message = "The request to the document processor (Gemini) was rejected due to API quota limits. Please wait or upgrade your API plan."
        return f"<html><body><h1>API Quota Error</h1><p>{html.escape(error_message)}</p></body></html>"
    except Exception as e:
        error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
387
+
388
+
389
# --- API Endpoint Orchestrating the Pipeline ---
@app.post("/api/translate_file", response_class=HTMLResponse)
async def translate_document_to_raw_html(target_language: str = Form(...), file: UploadFile = File(...)):
    """
    Processes a document using the final, robust pipeline:
    1. Nemo extracts content to JSON.
    2. Sea-Lion translates the text within the JSON.
    3. Gemini generates the final HTML from the translated JSON.

    Raises:
        HTTPException 400: unsupported upload type.
        HTTPException 502: a downstream model call failed.
        HTTPException 500: any other unexpected failure.
    """
    content_type = file.content_type
    if content_type not in ["application/pdf", "image/png", "image/jpeg"]:
        raise HTTPException(status_code=400, detail="Unsupported file type.")

    try:
        # === STEP 1: Get raw JSON from Nemo (The Parser) ===
        file_content = await file.read()
        file_b64 = base64.b64encode(file_content).decode("utf-8")
        nemo_data = {
            "model": "nvidia/nemoretriever-parse",
            "messages": [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": f"data:{content_type};base64,{file_b64}"}}]}],
            "max_tokens": 2048,
        }
        headers = {'accept': 'application/json', 'Content-Type': 'application/json'}
        model_response = requests.post('http://localhost:8000/v1/chat/completions', headers=headers, data=json.dumps(nemo_data))
        model_response.raise_for_status()
        nemo_response_json = model_response.json()

        # === STEP 2: Get translated JSON from Sea-Lion (The Translator) ===
        # NOTE(review): translate_text is annotated (text: str, ...) but is
        # handed a dict here — confirm the intended payload shape.
        translated_json = await translate_text(nemo_response_json, target_language)

        # === STEP 3: Generate final HTML from Gemini (The HTML Generator) ===
        # generate_html_from_translated_json returns an error page rather
        # than raising, so the result can be returned directly either way
        # (the previous version had a redundant duplicate return branch).
        final_html = await generate_html_from_translated_json(translated_json)

        # === STEP 4: Return the final result to the frontend ===
        return HTMLResponse(content=final_html)

    except HTTPException:
        # Preserve deliberate HTTP errors from helpers instead of
        # rewrapping them as generic 500s.
        raise
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=502, detail=f"Failed to communicate with a downstream AI model: {e}")
    except Exception as e:
        # This will catch any errors, including the ValueError from the Sea-Lion function
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred during processing: {e}")
442
+
443
@app.post("/api/verify_document_keywords")
async def verify_document_keywords(
    file: UploadFile = File(...),
    analysis_type: str = Form("legality"),
    search_context: str = Form("Singapore employment law")
):
    """
    Receives an HTML file and a configuration via form data, then uses the
    agent-to-agent RAG workflow to identify and verify key claims.

    Raises:
        HTTPException 400: not an HTML file, or no extractable text.
        HTTPException 500: the analysis tool reported an error or an
            unexpected failure occurred.
    """
    # Check if the uploaded file is an HTML file
    if file.content_type != "text/html":
        raise HTTPException(status_code=400, detail="Unsupported file type. Please upload a .html file.")

    try:
        # Step 1: Read content from the uploaded file and extract text
        html_content_bytes = await file.read()
        html_content = html_content_bytes.decode('utf-8')
        document_text = extract_text_from_html(html_content)

        if not document_text.strip():
            raise HTTPException(
                status_code=400,
                detail="Could not extract any meaningful text from the provided HTML content."
            )

        # Step 2: Prepare the configuration and call the analysis tool
        config = {
            "analysis_type": analysis_type,
            "search_context": search_context
        }
        analysis_results = await analyze_keywords_with_web_search(document_text, config)

        # Step 3: Handle potential errors from the tool
        if 'error' in analysis_results:
            raise HTTPException(status_code=500, detail=analysis_results['error'])

        # Step 4: Return the successful analysis
        return analysis_results

    except HTTPException:
        # Bug fix: the 400 for empty text (and the tool's 500) were being
        # swallowed by the generic handler below and returned as 500s.
        raise
    except Exception as e:
        # Catch any other unexpected errors during the process
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {str(e)}")
486
+
487
+
488
+
489
@app.post("/upload")
async def upload_file(
    authorization: str = Header(...),
    file: UploadFile = File(...)
):
    """Store an uploaded file in Supabase Storage and record its metadata.

    Verifies the caller's Clerk JWT, writes the file bytes to the
    ``user-documents`` bucket under a per-user path, then inserts a row
    into the ``documents`` table.

    Raises:
        HTTPException 401: missing/invalid bearer token.
        HTTPException 500: the storage upload or metadata insert failed.
    """
    if not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Missing Bearer token")

    token = authorization.split(" ")[1]
    claims = await verify_clerk_jwt(token)

    user_id = claims.get("sub")  # Clerk user ID
    # NOTE(review): the extension is hard-coded to .png regardless of
    # file.content_type — confirm uploads are always PNG or derive the suffix.
    filename = f"{user_id}/{uuid.uuid4()}.png"

    # Bug fix: the storage path, file_url and response message previously
    # contained a literal placeholder instead of the generated `filename`
    # (which was otherwise unused). One shared client now performs both calls.
    async with httpx.AsyncClient() as client:
        # Upload the raw bytes to Supabase Storage
        upload_resp = await client.post(
            f"{SUPABASE_URL}/storage/v1/object/user-documents/{filename}",
            headers={
                "Authorization": f"Bearer {SUPABASE_SERVICE_ROLE_KEY}",
                "Content-Type": file.content_type,
            },
            content=await file.read()
        )

        if upload_resp.status_code != 200:
            raise HTTPException(status_code=500, detail="Failed to upload to Supabase Storage")

        file_url = f"user-documents/{filename}"

        # Insert metadata into the `documents` table
        insert_resp = await client.post(
            f"{SUPABASE_URL}/rest/v1/documents",
            headers={
                "Authorization": f"Bearer {SUPABASE_SERVICE_ROLE_KEY}",
                "apikey": SUPABASE_SERVICE_ROLE_KEY,
                "Content-Type": "application/json",
                "Prefer": "return=representation"
            },
            json={
                "user_id": user_id,
                "filename": filename.split("/")[-1],
                "file_url": file_url
            }
        )

    if insert_resp.status_code >= 300:
        raise HTTPException(status_code=500, detail="Failed to insert document metadata")

    return {"message": f"File uploaded as {filename}"}
558
+
559
+
560
@app.get("/api/documents")
async def get_user_documents(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """List the caller's documents with short-lived signed download URLs.

    Verifies the Clerk JWT, fetches the caller's rows from the Supabase
    ``documents`` table, then attaches a one-hour signed URL for each file
    (``signed_url`` is None when signing fails or the path is empty).

    Raises:
        HTTPException 401: invalid user claims.
        HTTPException 500: the documents query failed.
    """
    token = credentials.credentials
    claims = await verify_clerk_jwt(token)
    user_id = claims.get("sub")
    if not user_id:
        raise HTTPException(status_code=401, detail="Invalid user")

    # One shared client for both the table query and the signing calls
    # (previously a second client was opened needlessly). Debug prints of
    # signed URLs were removed: they leaked access tokens into the logs.
    async with httpx.AsyncClient() as client:
        # Step 1: Get documents from Supabase
        resp = await client.get(
            f"{SUPABASE_URL}/rest/v1/documents?user_id=eq.{user_id}",
            headers={
                "apikey": SUPABASE_SERVICE_ROLE_KEY,
                "Authorization": f"Bearer {SUPABASE_SERVICE_ROLE_KEY}",
                "Accept": "application/json",
            },
        )

        if resp.status_code != 200:
            raise HTTPException(status_code=500, detail="Failed to fetch documents")

        documents = resp.json()

        # Step 2: Get signed URLs for each file
        for doc in documents:
            file_path = doc["file_url"].split("user-documents/", 1)[-1]
            if not file_path:
                doc["signed_url"] = None
                continue

            signed_url_resp = await client.post(
                f"{SUPABASE_URL}/storage/v1/object/sign/user-documents/{file_path}",
                headers={
                    "apikey": SUPABASE_SERVICE_ROLE_KEY,
                    "Authorization": f"Bearer {SUPABASE_SERVICE_ROLE_KEY}",
                },
                json={"expiresIn": 3600},  # 1 hour
            )

            if signed_url_resp.status_code == 200:
                doc["signed_url"] = f"{SUPABASE_URL}/storage/v1{signed_url_resp.json().get('signedURL')}"
            else:
                doc["signed_url"] = None

    return documents
auth/__pycache__/clerk.cpython-310.pyc ADDED
Binary file (1.25 kB). View file
 
auth/clerk.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # auth/clerk.py
2
+ from jose import jwt
3
+ import httpx
4
+ from fastapi import HTTPException
5
+
6
+ CLERK_ISSUER = "https://enabling-terrapin-28.clerk.accounts.dev" # e.g. https://enabling-terrapin-28.clerk.accounts.dev
7
+ CLERK_AUDIENCE = "http://localhost:3000" # Your frontend origin
8
+
9
async def verify_clerk_jwt(token: str) -> dict:
    """Validate a Clerk-issued JWT and return its decoded claims.

    Fetches the issuer's JWKS, selects the key matching the token's ``kid``
    header, and verifies signature, audience and issuer.

    Raises:
        HTTPException 401: the JWKS fetch failed, no matching public key
            was found, or the token failed verification.
    """
    try:
        async with httpx.AsyncClient() as client:
            jwks_url = f"{CLERK_ISSUER}/.well-known/jwks.json"
            resp = await client.get(jwks_url)
            # Bug fix: fail fast on a bad JWKS response instead of raising a
            # confusing KeyError/JSON error below.
            resp.raise_for_status()
            jwks = resp.json()["keys"]

        unverified_header = jwt.get_unverified_header(token)
        kid = unverified_header.get("kid")

        key = next((k for k in jwks if k["kid"] == kid), None)
        if not key:
            raise HTTPException(status_code=401, detail="Public key not found")

        payload = jwt.decode(
            token,
            key,
            algorithms=["RS256"],
            audience=CLERK_AUDIENCE,
            issuer=CLERK_ISSUER
        )
        return payload
    except HTTPException:
        # Keep the specific "Public key not found" detail instead of
        # rewrapping it in the generic message below.
        raise
    except Exception as e:
        raise HTTPException(status_code=401, detail=f"Invalid Clerk JWT: {str(e)}")
requirements.txt ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.9.0
2
+ annotated-types==0.7.0
3
+ anyio==4.9.0
4
+ beautifulsoup4==4.13.4
5
+ cachetools==5.5.2
6
+ certifi==2025.7.14
7
+ cffi==1.17.1
8
+ charset-normalizer==3.4.2
9
+ click==8.2.1
10
+ colorama==0.4.6
11
+ cryptography==45.0.5
12
+ dnspython==2.7.0
13
+ dotenv==0.9.9
14
+ ecdsa==0.19.1
15
+ email_validator==2.2.0
16
+ exceptiongroup==1.3.0
17
+ fastapi==0.116.1
18
+ fastapi-cli==0.0.8
19
+ fastapi-cloud-cli==0.1.4
20
+ filelock==3.13.1
21
+ fsspec==2024.6.1
22
+ google-ai-generativelanguage==0.6.15
23
+ google-api-core==2.25.1
24
+ google-api-python-client==2.177.0
25
+ google-auth==2.40.3
26
+ google-auth-httplib2==0.2.0
27
+ google-generativeai==0.8.5
28
+ googleapis-common-protos==1.70.0
29
+ grpcio==1.74.0
30
+ grpcio-status==1.71.2
31
+ h11==0.16.0
32
+ httpcore==1.0.9
33
+ httplib2==0.22.0
34
+ httptools==0.6.4
35
+ httpx==0.28.1
36
+ huggingface-hub==0.34.3
37
+ idna==3.10
38
+ itsdangerous==2.2.0
39
+ Jinja2==3.1.6
40
+ langdetect==1.0.9
41
+ markdown-it-py==3.0.0
42
+ MarkupSafe==2.1.5
43
+ mdurl==0.1.2
44
+ mpmath==1.3.0
45
+ networkx==3.3
46
+ numpy==2.1.2
47
+ orjson==3.11.0
48
+ packaging==25.0
49
+ pillow==11.0.0
50
+ proto-plus==1.26.1
51
+ protobuf==5.29.5
52
+ psutil==7.0.0
53
+ pyasn1==0.6.1
54
+ pyasn1_modules==0.4.2
55
+ pycparser==2.22
56
+ pydantic==2.11.7
57
+ pydantic-extra-types==2.10.5
58
+ pydantic-settings==2.10.1
59
+ pydantic_core==2.33.2
60
+ Pygments==2.19.2
61
+ PyMuPDF==1.26.3
62
+ pyparsing==3.2.3
63
+ python-dotenv==1.1.1
64
+ python-jose==3.5.0
65
+ python-multipart==0.0.20
66
+ PyYAML==6.0.2
67
+ regex==2025.7.31
68
+ requests==2.32.4
69
+ rich==14.0.0
70
+ rich-toolkit==0.14.8
71
+ rignore==0.6.4
72
+ rsa==4.9.1
73
+ safetensors==0.5.3
74
+ sentry-sdk==2.33.2
75
+ shellingham==1.5.4
76
+ six==1.17.0
77
+ sniffio==1.3.1
78
+ soupsieve==2.7
79
+ starlette==0.47.2
80
+ sympy==1.13.3
81
+ tokenizers==0.21.4
82
+ torch==2.7.1+cu126
83
+ torchaudio==2.7.1+cu126
84
+ torchvision==0.22.1+cu126
85
+ tqdm==4.67.1
86
+ transformers==4.54.1
87
+ typer==0.16.0
88
+ typing-inspection==0.4.1
89
+ typing_extensions==4.12.2
90
+ ujson==5.10.0
91
+ uritemplate==4.2.0
92
+ urllib3==2.5.0
93
+ uvicorn==0.35.0
94
+ watchfiles==1.1.0
95
+ websockets==15.0.1
96
+ langextract
tools/TOOLS_README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Test Summary Tool:
2
+ ```bash
3
+ curl -X POST "http://localhost:8080/api/analyze_html" -F "[email protected];type=text/html"
4
+ ```
5
+
6
+ response:
7
+ ```json
8
+ {"summary":"This payslip shows earnings including base salary, allowances, and overtime pay, with deductions for advances, loans, and CPF, resulting in a net pay of 2363.40 or 4213.40. CPF contributions are also detailed.","earnings":["基 本工资 (Basic Salary): 1800.00","总加班费 (Total Overtime): 368.16","[加班1.5倍] (Overtime 1.5x): 141.60","[加班2.0倍] (Overtime 2.0x): 226.56","住宿补贴 (Housing Allowance): 450.00","特别津贴 (Special Allowance): 100.00","交通津贴 (Transport Allowance): 300.00","雇主公积金 (Employer CPF): 180.00"],"deductions":["第一周预支 (First Week Advance): -300.00","员工贷款 (Employee Loan): -80.00","CDAC: -1.00","员工公积金 (Employee CPF): -191.00","总无薪假 (Total Unpaid Leave): -82.76"],"additional_info":{"gross_pay":["2935.40","4935.40"],"net_pay":["2363.40","4213.40"],"cpf_salary":["2555.40","4555.40"],"total_cpf":["371.00","661.00"],"annual_leave":{"used":"1.00","balance":"48.00"},"medical_leave":{"used":"0.00","balance":"14.00"},"bank_details":{"bank":"华侨银行 (OCBC Bank)","account_number":"151179932"}}}
9
+ ```
10
+
11
+ Test Keyword Tool:
12
+
13
+ ```bash
14
+ curl -X POST "http://localhost:8080/api/verify_document_keywords" -F "[email protected];type=text/html" -F "analysis_type=legality" -F "search_context=Singapore employment law"
15
+
16
+ ```
17
+
18
+ response:
19
+ ```json
20
+ {"analysis_configuration":{"analysis_type":"legality","search_context":"Singapore employment law"},"verification_results":[{"claim":"基本工资 1800.00","summary":"Claim states a base salary of 1800.00. No evidence to verify.","status":"Needs Manual Review"},{"claim":"[ 加班1. 5倍 ] 10. 00小时 x $ 14. 16 = 141. 60","summary":"Claim states overtime pay at 1.5x rate for 10 hours. The calculation needs verification. No evidence to verify the hourly rate or overtime policy.","status":"Needs Manual Review"},{"claim":"[ 加班2. 0倍 ] 12. 00小时 x $ 18. 88 = 226. 56","summary":"Claim states overtime pay at 2.0x rate for 12 hours. The calculation needs verification. No evidence to verify the hourly rate or overtime policy.","status":"Needs Manual Review"},{"claim":"员工公积金 - 191. 00","summary":"Claim states a deduction of 191.00 for employee housing fund. No evidence to verify the legality or accuracy.","status":"Needs Manual Review"},{"claim":"年假 / 已用 / 余额 : 1. 00 / 1. 00 / 48. 00","summary":"Claim states vacation leave information. No evidence to verify the accuracy or legality of the leave policy.","status":"Needs Manual Review"}]}
21
+ ```
tools/__pycache__/tools.cpython-310.pyc ADDED
Binary file (9.67 kB). View file
 
tools/extraction_results.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"extractions": [{"extraction_class": "character", "extraction_text": "Lady Juliet", "char_interval": {"start_pos": 0, "end_pos": 11}, "alignment_status": "match_exact", "extraction_index": 1, "group_index": 0, "description": null, "attributes": {"emotional_state": "longing"}}, {"extraction_class": "emotion", "extraction_text": "heart aching", "char_interval": {"start_pos": 46, "end_pos": 58}, "alignment_status": "match_exact", "extraction_index": 2, "group_index": 1, "description": null, "attributes": {"feeling": "ache"}}, {"extraction_class": "relationship", "extraction_text": "for Romeo", "char_interval": {"start_pos": 59, "end_pos": 68}, "alignment_status": "match_exact", "extraction_index": 3, "group_index": 2, "description": null, "attributes": {"type": "love"}}], "text": "Lady Juliet gazed longingly at the stars, her heart aching for Romeo", "document_id": "doc_211712b3"}
tools/langextract_tool.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""One-shot demo: entity extraction with langextract + Gemini.

Extracts characters, emotions and relationships from a sample sentence,
saves the annotated document to ``extraction_results.jsonl`` and writes an
interactive HTML visualization to ``visualization.html``.
"""

import os
import textwrap

import langextract as lx
from dotenv import load_dotenv

# Step 1: Load environment variables from a .env file
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    # Fail fast with an actionable message; assigning None into os.environ
    # would otherwise raise an opaque TypeError at the next line.
    raise RuntimeError("GEMINI_API_KEY is not set; add it to your .env file.")
os.environ["LANGEXTRACT_API_KEY"] = GEMINI_API_KEY

# 1. Define the prompt and extraction rules
prompt = textwrap.dedent(
    """\
    Extract characters, emotions, and relationships in order of appearance.
    Use exact text for extractions. Do not paraphrase or overlap entities.
    Provide meaningful attributes for each entity to add context."""
)

# 2. Provide a high-quality example to guide the model
examples = [
    lx.data.ExampleData(
        text="ROMEO. But soft! What light through yonder window breaks? It is the east, and Juliet is the sun.",
        extractions=[
            lx.data.Extraction(
                extraction_class="character",
                extraction_text="ROMEO",
                attributes={"emotional_state": "wonder"},
            ),
            lx.data.Extraction(
                extraction_class="emotion",
                extraction_text="But soft!",
                attributes={"feeling": "gentle awe"},
            ),
            lx.data.Extraction(
                extraction_class="relationship",
                extraction_text="Juliet is the sun",
                attributes={"type": "metaphor"},
            ),
        ],
    )
]

# The input text to be processed
input_text = "Lady Juliet gazed longingly at the stars, her heart aching for Romeo"

# Run the extraction
result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash",
)

# Save the results to a JSONL file
lx.io.save_annotated_documents(
    [result], output_name="extraction_results.jsonl", output_dir="."
)

# Generate the visualization from the file
html_content = lx.visualize("extraction_results.jsonl")
with open("visualization.html", "w", encoding="utf-8") as f:
    f.write(html_content)
tools/tools.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ./tools/tools.py
#
# Shared tool functions: HTML text extraction, Gemini-based document
# analysis, and Google Programmable Search (RAG) helpers.

import os
import json
import logging
import asyncio
import itertools
from functools import partial
from concurrent.futures import ThreadPoolExecutor

import google.generativeai as genai
from google.api_core import exceptions as google_exceptions
from googleapiclient.discovery import build
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Step 1: Load environment variables from a .env file
load_dotenv()

# Configure a logger for the tool
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Step 2: Configure the Gemini API key right after loading it so every
# genai call made by this module is authenticated.
try:
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("GEMINI_API_KEY not found in environment variables.")
    genai.configure(api_key=api_key)
except (ValueError, TypeError) as e:
    # Emit through the module logger (configured just above) instead of a
    # bare print() so the warning lands in the same stream as all other logs
    # when the server starts without a key.
    logger.warning(f"WARNING: Gemini API not configured. Tool will fail. Reason: {e}")
34
+
35
+
36
def extract_text_from_html(html_content: str) -> str:
    """Return the human-readable text contained in *html_content*.

    Script and style elements are stripped before extraction so only
    visible body text remains; an empty or falsy input yields "".
    """
    if not html_content:
        return ""

    parsed = BeautifulSoup(html_content, "html.parser")

    # Remove non-visible content so it never leaks into the extracted text.
    for hidden in parsed(["script", "style"]):
        hidden.decompose()

    return parsed.get_text(separator=" ", strip=True)
50
+
51
+
52
async def generate_document_insights(document_text: str) -> dict:
    """Summarize payslip text and extract key financial figures via Gemini.

    Args:
        document_text: Plain text previously extracted from a payslip/document.

    Returns:
        On success, a dict with "summary" (str), "earnings" (list[str]) and
        "deductions" (list[str]) parsed from the model's JSON reply.
        On quota exhaustion or other failure, a dict with an "error" key.
        If the model reply is not valid JSON, a fallback dict with "summary"
        and "key_points" (the raw model text) is returned instead.
    """
    try:
        if not document_text.strip():
            return {
                "error": "Could not extract any meaningful text from the provided content."
            }

        # gemini-2.5-flash: fast, inexpensive model suitable for extraction.
        model = genai.GenerativeModel(model_name="gemini-2.5-flash")

        prompt = f"""
        You are an expert financial analyst who specializes in interpreting payslips and financial documents.
        Based on the text below, which was extracted from a payslip, perform two tasks:
        1. **Summarize**: Create a concise, one-sentence summary of the payslip, focusing on the final net pay.
        2. **Extract Key Figures**: Identify and list the most important financial figures as bullet points. Categorize them into "Earnings," and "Deductions."

        **Document Text:**
        ---
        {document_text}
        ---

        Please format your response as a valid JSON object with three keys: "summary" (a string), "earnings" (an array of strings), and "deductions" (an array of strings).

        Example Format:
        {{
          "summary": "This payslip shows a net pay of [Net Pay Amount] after calculating total earnings and deductions.",
          "earnings": [
            "Basic Salary: 1800.00",
            "Total Overtime: 368.16",
            "Housing Allowance: 450.00"
          ],
          "deductions": [
            "Advance (Week 1): -300.00",
            "Employee Loan: -80.00",
            "Employee CPF: -191.00"
          ]
        }}
        """

        response = await model.generate_content_async(prompt)

        # BUGFIX: strip Markdown code fences only at the *edges* of the reply.
        # The previous blanket str.replace("```", "") also deleted backticks
        # that happened to appear inside the JSON payload, corrupting it.
        cleaned_response_text = response.text.strip()
        if cleaned_response_text.startswith("```"):
            cleaned_response_text = cleaned_response_text.removeprefix("```json")
            cleaned_response_text = cleaned_response_text.removeprefix("```").lstrip()
        cleaned_response_text = cleaned_response_text.removesuffix("```").strip()

        insights = json.loads(cleaned_response_text)
        return insights

    except google_exceptions.ResourceExhausted as e:
        return {
            "error": f"Gemini API quota exceeded. Please try again later. Details: {e}"
        }
    except json.JSONDecodeError:
        # Model replied, but not with valid JSON: surface the raw text so the
        # caller still gets something actionable.
        return {
            "summary": "Could not parse the AI's response.",
            "key_points": [response.text],
        }
    except Exception as e:
        # Also catches the case where genai was never configured (missing key).
        return {
            "error": f"An unexpected error occurred during document analysis: {str(e)}"
        }
117
+
118
+
119
def _execute_single_google_search(query: str, max_results: int) -> list[dict]:
    """(Internal Helper) Performs a single synchronous web search using Google.

    Returns a list of {"title", "href", "snippet"} dicts; on any failure the
    error is logged and an empty list is returned (best-effort semantics).
    """
    logger.info(f"Executing web search for query: '{query}'...")
    try:
        # ADAPTATION: Fetch keys directly from environment variables.
        google_api_key = os.getenv("GEMINI_API_KEY")
        engine_id = os.getenv("GOOGLE_CSE_ID")
        if not google_api_key or not engine_id:
            raise ValueError(
                "GEMINI_API_KEY and GOOGLE_CSE_ID must be set in the environment."
            )

        client = build("customsearch", "v1", developerKey=google_api_key)
        raw = client.cse().list(q=query, cx=engine_id, num=max_results).execute()

        # MODIFICATION: Added 'snippet' for better RAG context.
        hits: list[dict] = []
        for item in raw.get("items", []):
            link = item.get("link")
            if not link:
                continue  # skip results without a usable URL
            hits.append(
                {
                    "title": item.get("title", "Untitled"),
                    "href": link,
                    "snippet": item.get("snippet", "No snippet available."),
                }
            )

        logger.info(f"Found {len(hits)} web results for query: '{query}'")
        return hits
    except Exception as e:
        logger.error(f"An error occurred during web search for '{query}': {e}")
        return []
154
+
155
+
156
async def perform_searches_and_get_hits(
    queries: list[str], executor: ThreadPoolExecutor, max_results_per_query: int = 3
) -> list[dict]:
    """Asynchronously runs multiple Google searches and returns a de-duplicated list of hits."""
    if not queries:
        return []
    logger.info(f"\n--- Starting concurrent web search for {len(queries)} queries ---")
    event_loop = asyncio.get_running_loop()

    # ADAPTATION: Removed settings dependency.
    # Fan the blocking searches out onto the thread pool and await them all.
    futures = [
        event_loop.run_in_executor(
            executor,
            partial(_execute_single_google_search, query, max_results_per_query),
        )
        for query in queries
    ]
    per_query_results = await asyncio.gather(*futures)

    # De-duplicate by URL; a later hit for the same URL replaces the earlier one.
    deduped: dict[str, dict] = {}
    for hit in itertools.chain.from_iterable(per_query_results):
        deduped[hit["href"]] = hit

    final_hits = list(deduped.values())
    logger.info(
        f"--- Web search complete. Found {len(final_hits)} unique items in total. ---"
    )
    return final_hits
182
+
183
+
184
# --- MODIFIED: Keyword Analysis Tool now uses the full search pipeline ---


async def analyze_keywords_with_web_search(document_text: str, config: dict) -> dict:
    """
    Analyzes and verifies keywords using a two-agent RAG process with
    efficient batching for verification to avoid rate limits.

    Args:
        document_text: Text of the document whose claims should be verified.
        config: Options dict; recognized keys are "analysis_type"
            (default "accuracy") and "search_context" (default "public records").

    Returns:
        On success, {"analysis_configuration": config, "verification_results":
        [...]} where each result has "claim", "summary" and "status" keys.
        When no keywords are found, a {"message", "verification_results"} dict.
        On failure, a dict with an "error" key (plus "raw_response" when an
        agent returned malformed JSON).
    """
    try:
        model = genai.GenerativeModel(model_name="gemini-2.0-flash")
        analysis_type = config.get("analysis_type", "accuracy")
        search_context = config.get("search_context", "public records")

        # --- Agent 1: Keyword/Claim Extraction (1 API Call) ---
        logger.info("Agent 1: Extracting keywords from document...")
        keyword_extraction_prompt = f"""
        You are an expert analyst specializing in document verification. Based on the document text below,
        identify and extract up to 5 critical keywords, figures, or claims that must be verified for {analysis_type}
        within the context of "{search_context}".

        Focus on terms that are verifiable against external sources.
        Return your findings as a valid JSON array of strings.

        Document Text:
        ---
        {document_text}
        ---
        """
        response_agent1 = await model.generate_content_async(keyword_extraction_prompt)
        cleaned_agent1_response = (
            response_agent1.text.strip()
            .replace("```json", "")
            .replace("```", "")
            .strip()
        )

        try:
            keywords_to_verify = json.loads(cleaned_agent1_response)
        except json.JSONDecodeError:
            return {
                "error": "Agent 1 (Keyword Extractor) failed to return valid JSON.",
                "raw_response": cleaned_agent1_response,
            }

        # ROBUSTNESS: the model occasionally returns an object instead of the
        # requested array; iterating it would silently "verify" its keys.
        if not isinstance(keywords_to_verify, list):
            return {
                "error": "Agent 1 (Keyword Extractor) failed to return valid JSON.",
                "raw_response": cleaned_agent1_response,
            }

        if not keywords_to_verify:
            return {
                "message": "No keywords were identified for verification.",
                "verification_results": [],
            }

        logger.info(
            f"Agent 1 found {len(keywords_to_verify)} keywords: {keywords_to_verify}"
        )

        # --- (Optional) Polite Delay ---
        # A small pause between the two main API calls. Not strictly needed for
        # rate limiting anymore, but can be good practice.
        await asyncio.sleep(2)

        # --- Live Web Search (No API Calls to Gemini) ---
        dork_queries = [
            f'"{keyword}" AND "{search_context}"' for keyword in keywords_to_verify
        ]
        with ThreadPoolExecutor() as executor:
            all_search_hits = await perform_searches_and_get_hits(
                dork_queries, executor
            )

        # --- Agent 2: Batch Verification (1 API Call for all keywords) ---
        logger.info("Agent 2: Starting batch verification for all keywords...")

        # Step 1: Prepare the evidence for each claim (match by substring in
        # title or snippet; cap at 3 snippets per claim to bound prompt size).
        verification_items_for_prompt = []
        for keyword in keywords_to_verify:
            relevant_hits = [
                hit
                for hit in all_search_hits
                if keyword.lower() in hit.get("title", "").lower()
                or keyword.lower() in hit.get("snippet", "").lower()
            ]
            web_snippets = (
                "\n".join([f"- {hit['snippet']}" for hit in relevant_hits[:3]])
                if relevant_hits
                else "No specific information found on the web."
            )

            # Create a formatted block for each item to be verified
            item_block = f'Claim: "{keyword}"\n' f"Evidence:\n{web_snippets}\n" f"---"
            verification_items_for_prompt.append(item_block)

        # BUGFIX: the claim blocks were previously concatenated with ''.join,
        # which ran each "---" separator straight into the next 'Claim:' line
        # (e.g. '---Claim: ...'). Join with newlines so every claim block is
        # clearly delimited in the prompt.
        claims_section = "\n".join(verification_items_for_prompt)

        # Step 2: Create a single, powerful batch prompt
        batch_verification_prompt = f"""
        You are a verification agent. For EACH of the following claims, assess its {analysis_type} based ONLY on the provided evidence.
        Your response MUST be a valid JSON array, where each object has three keys: "claim", "summary", and "status".
        The status must be one of: "Verified", "Contradicted", or "Needs Manual Review".

        Here are the claims to verify:

        {claims_section}

        Provide only the JSON array as your final answer. Do not include markdown backticks.
        """

        # Step 3: Make a single API call for all verifications
        response_agent2 = await model.generate_content_async(batch_verification_prompt)
        cleaned_agent2_response = (
            response_agent2.text.strip()
            .replace("```json", "")
            .replace("```", "")
            .strip()
        )

        # Step 4: Parse the batch response
        try:
            verification_results = json.loads(cleaned_agent2_response)
        except json.JSONDecodeError:
            logger.error(
                f"Agent 2 (Verifier) failed to return valid JSON in batch mode. Raw response: {cleaned_agent2_response}"
            )
            return {
                "error": "Agent 2 (Verifier) failed to return valid JSON in batch mode.",
                "raw_response": cleaned_agent2_response,
            }

        logger.info("Agent 2: Batch verification complete.")
        return {
            "analysis_configuration": config,
            "verification_results": verification_results,
        }

    except Exception as e:
        logger.error(
            f"An unexpected error occurred in the keyword analysis tool: {str(e)}",
            exc_info=True,
        )
        return {
            "error": f"An unexpected error occurred in the keyword analysis tool: {str(e)}"
        }
tools/visualization.html ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<style>
/* Generated by langextract's visualizer: styles for the animated
   entity-highlight player (highlights, tooltips, controls, legend). */

/* Base highlight span; relative positioning anchors the hover tooltip. */
.lx-highlight { position: relative; border-radius:3px; padding:1px 2px;}
/* Tooltip: hidden by default, faded in by the :hover rule below. */
.lx-highlight .lx-tooltip {
visibility: hidden;
opacity: 0;
transition: opacity 0.2s ease-in-out;
background: #333;
color: #fff;
text-align: left;
border-radius: 4px;
padding: 6px 8px;
position: absolute;
z-index: 1000;
bottom: 125%;
left: 50%;
transform: translateX(-50%);
font-size: 12px;
max-width: 240px;
white-space: normal;
box-shadow: 0 2px 6px rgba(0,0,0,0.3);
}
.lx-highlight:hover .lx-tooltip { visibility: visible; opacity:1; }
/* Outer wrapper for the whole widget. */
.lx-animated-wrapper { max-width: 100%; font-family: Arial, sans-serif; }
/* Playback-control panel (buttons + slider + status line). */
.lx-controls {
background: #fafafa; border: 1px solid #90caf9; border-radius: 8px;
padding: 12px; margin-bottom: 16px;
}
.lx-button-row {
display: flex; justify-content: center; gap: 8px; margin-bottom: 12px;
}
.lx-control-btn {
background: #4285f4; color: white; border: none; border-radius: 4px;
padding: 8px 16px; cursor: pointer; font-size: 13px; font-weight: 500;
transition: background-color 0.2s;
}
.lx-control-btn:hover { background: #3367d6; }
.lx-progress-container {
margin-bottom: 8px;
}
/* Custom-styled range input; thumb styled per engine below. */
.lx-progress-slider {
width: 100%; margin: 0; appearance: none; height: 6px;
background: #ddd; border-radius: 3px; outline: none;
}
.lx-progress-slider::-webkit-slider-thumb {
appearance: none; width: 18px; height: 18px; background: #4285f4;
border-radius: 50%; cursor: pointer;
}
.lx-progress-slider::-moz-range-thumb {
width: 18px; height: 18px; background: #4285f4; border-radius: 50%;
cursor: pointer; border: none;
}
.lx-status-text {
text-align: center; font-size: 12px; color: #666; margin-top: 4px;
}
/* Scrollable source-text pane; pre-wrap preserves original whitespace. */
.lx-text-window {
font-family: monospace; white-space: pre-wrap; border: 1px solid #90caf9;
padding: 12px; max-height: 260px; overflow-y: auto; margin-bottom: 12px;
line-height: 1.6;
}
/* Panel showing the current extraction's class/attributes and the legend. */
.lx-attributes-panel {
background: #fafafa; border: 1px solid #90caf9; border-radius: 6px;
padding: 8px 10px; margin-top: 8px; font-size: 13px;
}
/* Applied by the script to the extraction currently in focus. */
.lx-current-highlight {
border-bottom: 4px solid #ff4444;
font-weight: bold;
animation: lx-pulse 1s ease-in-out;
}
@keyframes lx-pulse {
0% { text-decoration-color: #ff4444; }
50% { text-decoration-color: #ff0000; }
100% { text-decoration-color: #ff4444; }
}
.lx-legend {
font-size: 12px; margin-bottom: 8px;
padding-bottom: 8px; border-bottom: 1px solid #e0e0e0;
}
/* Colored chips in the legend, one per extraction class. */
.lx-label {
display: inline-block;
padding: 2px 4px;
border-radius: 3px;
margin-right: 4px;
color: #000;
}
.lx-attr-key {
font-weight: 600;
color: #1565c0;
letter-spacing: 0.3px;
}
.lx-attr-value {
font-weight: 400;
opacity: 0.85;
letter-spacing: 0.2px;
}

/* Add optimizations with larger fonts and better readability for GIFs */
.lx-gif-optimized .lx-text-window { font-size: 16px; line-height: 1.8; }
.lx-gif-optimized .lx-attributes-panel { font-size: 15px; }
.lx-gif-optimized .lx-current-highlight { text-decoration-thickness: 4px; }
</style>
101
<div class="lx-animated-wrapper lx-gif-optimized">
<!-- Legend plus per-extraction details; #attributesContainer is filled by the script. -->
<div class="lx-attributes-panel">
<div class="lx-legend">Highlights Legend: <span class="lx-label" style="background-color:#D2E3FC;">character</span> <span class="lx-label" style="background-color:#C8E6C9;">emotion</span> <span class="lx-label" style="background-color:#FEF0C3;">relationship</span></div>
<div id="attributesContainer"></div>
</div>
<!-- Source text with one highlight span per extraction; data-idx links each
     span to the script's extractions array. NOTE: this pane uses
     white-space: pre-wrap, so its inner whitespace must not be reformatted. -->
<div class="lx-text-window" id="textWindow">
<span class="lx-highlight lx-current-highlight" data-idx="0" style="background-color:#D2E3FC;">Lady Juliet</span> gazed longingly at the stars, her <span class="lx-highlight" data-idx="1" style="background-color:#C8E6C9;">heart aching</span> <span class="lx-highlight" data-idx="2" style="background-color:#FEF0C3;">for Romeo</span>
</div>
<!-- Playback controls; onclick handlers are exported on window by the script. -->
<div class="lx-controls">
<div class="lx-button-row">
<button class="lx-control-btn" onclick="playPause()">▶️ Play</button>
<button class="lx-control-btn" onclick="prevExtraction()">⏮ Previous</button>
<button class="lx-control-btn" onclick="nextExtraction()">⏭ Next</button>
</div>
<!-- Slider max is extractions.length - 1 (3 extractions -> max=2). -->
<div class="lx-progress-container">
<input type="range" id="progressSlider" class="lx-progress-slider"
min="0" max="2" value="0"
onchange="jumpToExtraction(this.value)">
</div>
<div class="lx-status-text">
Entity <span id="entityInfo">1/3</span> |
Pos <span id="posInfo">[0-11]</span>
</div>
</div>
</div>
126
+
127
+ <script>
128
+ (function() {
129
+ const extractions = [{"index": 0, "class": "character", "text": "Lady Juliet", "color": "#D2E3FC", "startPos": 0, "endPos": 11, "beforeText": "", "extractionText": "Lady Juliet", "afterText": " gazed longingly at the stars, her heart aching for Romeo", "attributesHtml": "<div><strong>class:</strong> character</div><div><strong>attributes:</strong> {<span class=\"lx-attr-key\">emotional_state</span>: <span class=\"lx-attr-value\">longing</span>}</div>"}, {"index": 1, "class": "emotion", "text": "heart aching", "color": "#C8E6C9", "startPos": 46, "endPos": 58, "beforeText": "Lady Juliet gazed longingly at the stars, her ", "extractionText": "heart aching", "afterText": " for Romeo", "attributesHtml": "<div><strong>class:</strong> emotion</div><div><strong>attributes:</strong> {<span class=\"lx-attr-key\">feeling</span>: <span class=\"lx-attr-value\">ache</span>}</div>"}, {"index": 2, "class": "relationship", "text": "for Romeo", "color": "#FEF0C3", "startPos": 59, "endPos": 68, "beforeText": "Lady Juliet gazed longingly at the stars, her heart aching ", "extractionText": "for Romeo", "afterText": "", "attributesHtml": "<div><strong>class:</strong> relationship</div><div><strong>attributes:</strong> {<span class=\"lx-attr-key\">type</span>: <span class=\"lx-attr-value\">love</span>}</div>"}];
130
+ let currentIndex = 0;
131
+ let isPlaying = false;
132
+ let animationInterval = null;
133
+ let animationSpeed = 1.0;
134
+
135
+ function updateDisplay() {
136
+ const extraction = extractions[currentIndex];
137
+ if (!extraction) return;
138
+
139
+ document.getElementById('attributesContainer').innerHTML = extraction.attributesHtml;
140
+ document.getElementById('entityInfo').textContent = (currentIndex + 1) + '/' + extractions.length;
141
+ document.getElementById('posInfo').textContent = '[' + extraction.startPos + '-' + extraction.endPos + ']';
142
+ document.getElementById('progressSlider').value = currentIndex;
143
+
144
+ const playBtn = document.querySelector('.lx-control-btn');
145
+ if (playBtn) playBtn.textContent = isPlaying ? '⏸ Pause' : '▶️ Play';
146
+
147
+ const prevHighlight = document.querySelector('.lx-text-window .lx-current-highlight');
148
+ if (prevHighlight) prevHighlight.classList.remove('lx-current-highlight');
149
+ const currentSpan = document.querySelector('.lx-text-window span[data-idx="' + currentIndex + '"]');
150
+ if (currentSpan) {
151
+ currentSpan.classList.add('lx-current-highlight');
152
+ currentSpan.scrollIntoView({block: 'center', behavior: 'smooth'});
153
+ }
154
+ }
155
+
156
+ function nextExtraction() {
157
+ currentIndex = (currentIndex + 1) % extractions.length;
158
+ updateDisplay();
159
+ }
160
+
161
+ function prevExtraction() {
162
+ currentIndex = (currentIndex - 1 + extractions.length) % extractions.length;
163
+ updateDisplay();
164
+ }
165
+
166
+ function jumpToExtraction(index) {
167
+ currentIndex = parseInt(index);
168
+ updateDisplay();
169
+ }
170
+
171
+ function playPause() {
172
+ if (isPlaying) {
173
+ clearInterval(animationInterval);
174
+ isPlaying = false;
175
+ } else {
176
+ animationInterval = setInterval(nextExtraction, animationSpeed * 1000);
177
+ isPlaying = true;
178
+ }
179
+ updateDisplay();
180
+ }
181
+
182
+ window.playPause = playPause;
183
+ window.nextExtraction = nextExtraction;
184
+ window.prevExtraction = prevExtraction;
185
+ window.jumpToExtraction = jumpToExtraction;
186
+
187
+ updateDisplay();
188
+ })();
189
+ </script>