KevanSoon commited on
Commit
f147852
·
1 Parent(s): 89ca815

first project init

Browse files
app.py ADDED
@@ -0,0 +1,610 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ import asyncio
4
+ import re
5
+ import os
6
+ import html
7
+ import requests
8
+ import httpx
9
+ import uuid
10
+ from fastapi import FastAPI, File, Form, UploadFile, HTTPException, Request, Header
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from fastapi.responses import HTMLResponse
13
+ from fastapi import Depends
14
+ from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
15
+ from pydantic import BaseModel
16
+ from requests.exceptions import RequestException
17
+ from dotenv import load_dotenv
18
+ import google.generativeai as genai
19
+ from google.api_core import exceptions as google_exceptions
20
+ from pydantic import BaseModel
21
+ from auth.clerk import verify_clerk_jwt
22
+ from tools.tools import extract_text_from_html, generate_document_insights, analyze_keywords_with_web_search
23
+
24
+
25
# Bearer-token extractor used by protected endpoints (see /api/documents).
security = HTTPBearer()
# Load environment variables from a .env file
load_dotenv()

# Supabase project URL and service-role key; the service-role key bypasses
# row-level security, so it must never be exposed to clients.
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")

app = FastAPI(
    title="Document Translator (Final Architecture)",
    description="Pipeline: Nemo (JSON) -> Sea-Lion (Translate JSON) -> Gemini (HTML)",
    version="10.0.1",  # Final Architecture, patched
)

# Allow requests from the default React frontend port
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3000"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
46
+
47
+
48
def wrap_words_with_spans(html: str) -> str:
    """Wrap every word inside <p>, <h1>, <h2> and <td> elements in a
    clickable <span> carrying a unique, document-wide sequential id.

    Each word becomes
    ``<span data-clickable="true" data-id="word-N">word</span>`` so the
    injected dropdown script (see inject_dropdown_script) can attach
    click handlers.

    Bug fix vs. the previous version: the word regex is now applied only
    to text segments *outside* markup, so nested tags inside a target
    element (e.g. ``<p><b>word</b></p>``) no longer have their tag names
    and attributes wrapped in spans.
    """
    counter = 0
    # Matches a word with optional trailing punctuation.
    word_re = r'\b\w+[.,?!]?\b'

    def wrap_word(match):
        nonlocal counter
        counter += 1
        return f'<span data-clickable="true" data-id="word-{counter}">{match.group(0)}</span>'

    def wrap_inner(m):
        open_tag, inner_text, close_tag = m.groups()
        # Split the inner HTML into tag and non-tag segments and only
        # substitute inside the plain-text segments.
        parts = re.split(r'(<[^>]+>)', inner_text)
        wrapped = ''.join(
            part if part.startswith('<') else re.sub(word_re, wrap_word, part)
            for part in parts
        )
        return open_tag + wrapped + close_tag

    for tag in ('p', 'h1', 'h2', 'td'):
        # Capture content inside these tags (non-greedy, across newlines).
        regex = re.compile(fr'(<{tag}[^>]*>)(.*?)(</{tag}>)', re.DOTALL)
        html = regex.sub(wrap_inner, html)

    return html
68
+
69
def inject_dropdown_script(html: str) -> str:
    """Attach the client-side translation UI to a rendered HTML document.

    Inserts a <script> block that makes every ``span[data-clickable="true"]``
    (produced by wrap_words_with_spans) clickable: a click opens a language
    dropdown positioned under the word, and the chosen translation is fetched
    from the /api/translate_frontend endpoint and swapped into the span.

    The script is placed immediately before ``</body>`` when that tag is
    present; otherwise it is appended to the end of the document.
    """
    script = """
<script>
window.addEventListener('DOMContentLoaded', () => {

function createDropdown(x, y, wordEl, word) {
// Remove any existing dropdown
const oldDropdown = document.getElementById('translation-dropdown');
if (oldDropdown) oldDropdown.remove();

// Create dropdown select element
const dropdown = document.createElement('select');
dropdown.id = 'translation-dropdown';
dropdown.style.position = 'absolute';
dropdown.style.left = x + 'px';
dropdown.style.top = y + 'px';
dropdown.style.zIndex = 9999;

// Languages options
const languages = ['English', 'Chinese', 'Tamil', 'Hindi'];
languages.forEach(lang => {
const option = document.createElement('option');
option.value = lang.toLowerCase();
option.innerText = lang;
dropdown.appendChild(option);
});

// Placeholder option
const defaultOption = document.createElement('option');
defaultOption.value = '';
defaultOption.innerText = 'Select language';
defaultOption.selected = true;
defaultOption.disabled = true;
dropdown.insertBefore(defaultOption, dropdown.firstChild);

document.body.appendChild(dropdown);
dropdown.focus();

dropdown.addEventListener('change', () => {
const selectedLang = dropdown.value;
if (!selectedLang) return;

// Call backend to translate word
fetch('http://localhost:8080/api/translate_frontend', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ text: word, target_language: selectedLang }),
})
.then(res => {
if (!res.ok) throw new Error('Translation API error');
return res.json();
})
.then(data => {
const translated = data.translated_text || word;
wordEl.innerText = translated;

// Add or update language label
let label = wordEl.nextSibling;
if (!label || !label.classList || !label.classList.contains('language-label')) {
label = document.createElement('span');
label.className = 'language-label';
label.style.marginLeft = '6px';
label.style.fontSize = '0.8em';
label.style.color = '#555';
wordEl.after(label);
}
label.textContent = `(${dropdown.options[dropdown.selectedIndex].text})`;
})
.catch(err => {
console.error('Translation error:', err);
alert('Translation failed, please try again.');
});

dropdown.remove();
});

// Clicking outside closes dropdown
document.addEventListener('click', function onDocClick(e) {
if (!dropdown.contains(e.target)) {
dropdown.remove();
document.removeEventListener('click', onDocClick);
}
});
}

// Add click handlers to all words wrapped in spans with data-clickable="true"
document.querySelectorAll('span[data-clickable="true"]').forEach(el => {
el.style.cursor = 'pointer';
el.addEventListener('click', event => {
event.stopPropagation();
const word = el.innerText;
const rect = el.getBoundingClientRect();
const x = rect.left + window.scrollX;
const y = rect.bottom + window.scrollY;
createDropdown(x, y, el, word);
});
});

});
</script>
"""
    # Prefer injecting just before the closing body tag; fall back to a
    # plain append for fragments without one.
    if "</body>" not in html:
        return html + script
    return html.replace("</body>", script + "\n</body>")
174
+
175
# Define a Pydantic model to enforce the structure of the incoming request body.
# NOTE(review): no endpoint in this module currently uses this model
# (/api/analyze_html takes a file upload) — confirm whether it is still needed.
class HtmlAnalysisRequest(BaseModel):
    # Raw HTML document to analyze.
    html: str
178
+
179
@app.post("/api/analyze_html")
async def analyze_html_file(file: UploadFile = File(...)):
    """
    Receives an uploaded HTML file, extracts its text content, and uses the
    Gemini tool to generate a summary and key informational points.

    Raises:
        HTTPException 400: the upload is not an HTML file.
        HTTPException 500: the analysis tool reported an error or an
            unexpected failure occurred.
    """
    # Check if the uploaded file is an HTML file
    if file.content_type != "text/html":
        raise HTTPException(status_code=400, detail="Unsupported file type. Please upload a .html file.")

    try:
        # Step 1: Read the content of the uploaded file
        html_content_bytes = await file.read()
        html_content = html_content_bytes.decode('utf-8')

        # Step 2: Extract text from the HTML using our tool
        document_text = extract_text_from_html(html_content)

        # Step 3: Get insights from the Gemini tool
        analysis_results = await generate_document_insights(document_text)

        # Surface a functional error reported by the tool as a 500
        if 'error' in analysis_results:
            raise HTTPException(status_code=500, detail=analysis_results['error'])

        return analysis_results

    except HTTPException:
        # Bug fix: re-raise intentional HTTP errors unchanged instead of
        # letting the generic handler below rewrap them.
        raise
    except Exception as e:
        # Catch any other unexpected errors
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {e}")
209
+
210
+
211
@app.post("/api/translate_frontend")
async def translate_text(request: Request):
    """Translate a single snippet for the in-page dropdown UI.

    Expects a JSON body ``{"text": ..., "target_language": ...}`` and returns
    ``{"translated_text": ...}`` produced by the Sea-Lion chat-completions API.

    Raises:
        HTTPException 400: missing fields in the request body.
        HTTPException 500: missing API key or an empty model response.
        HTTPException 502: the Sea-Lion API call failed.
    """
    try:
        data = await request.json()
        text = data.get("text")
        target_language = data.get("target_language")

        if not text or not target_language:
            raise HTTPException(status_code=400, detail="Missing 'text' or 'target_language' in request body")

        url = "https://api.sea-lion.ai/v1/chat/completions"
        api_key = os.getenv("SEALION_API_KEY")
        # Fail early with a clear message instead of sending "Bearer None".
        if not api_key:
            raise HTTPException(status_code=500, detail="SEALION_API_KEY is not configured")

        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

        prompt = (
            f"Please translate the following text to {target_language} and return "
            "ONLY the translated text without any explanations or extra formatting:\n\n"
            f"\"{text}\""
        )

        payload = {
            "max_completion_tokens": 1024,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "model": "aisingapore/Gemma-SEA-LION-v3-9B-IT"
        }

        # NOTE(review): requests is blocking; inside an async endpoint this
        # stalls the event loop. Consider httpx.AsyncClient (already a
        # dependency) in a follow-up.
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()

        # Extract translated text from the chat-completions response
        response_json = response.json()
        translated_text = response_json["choices"][0]["message"]["content"].strip()

        if not translated_text:
            raise HTTPException(status_code=500, detail="Empty response from translation model.")

        return {"translated_text": translated_text}

    except HTTPException:
        # Bug fix: keep the intended status codes (400/500) instead of
        # rewrapping them as generic 500s below.
        raise
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=502, detail=f"Translation API request failed: {e}")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
265
+
266
+
267
# --- Model 2: Sea-Lion (The JSON Translator) ---
@app.post("/api/translate")
async def translate_text(text: str, target_language: str):
    """
    Receives text and a target language, and returns the translated text
    using the SEA-LION model.

    NOTE(review): this definition shares its name with the
    /api/translate_frontend handler above. Both routes stay registered
    (the decorator runs before the rebinding), but module-level calls to
    ``translate_text`` — e.g. from the /api/translate_file pipeline —
    resolve to THIS function. Consider renaming one of them.
    """
    # The API endpoint URL for translation
    url = "https://api.sea-lion.ai/v1/chat/completions"

    # API keys are kept in environment variables (loaded via dotenv)
    api_key = os.getenv("SEALION_API_KEY")

    # The headers for the request
    headers = {
        "accept": "text/plain",
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    # Bug fix: the original prompt had the placeholders swapped
    # ("Translate the following text to {text}: \"{target_language}\""),
    # asking the model to translate the language name into the document.
    prompt = f"Translate the following text to {target_language}: \"{text}\""

    # The JSON data payload for the request
    data = {
        "max_completion_tokens": 4096,  # Increased token limit for longer translations
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "model": "aisingapore/Llama-SEA-LION-v3-70B-IT"
    }

    try:
        # Make the POST request to the SEA-LION API
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

        # The raw response body is forwarded as-is. NOTE(review): the
        # downstream Gemini prompt expects the full chat-completions JSON
        # shape inside "translated_text" — confirm against the Sea-Lion API
        # before parsing this into choices[0] like the frontend endpoint does.
        translated_text = response.text

        # Guard against an empty body from the model
        if not translated_text:
            raise HTTPException(status_code=500, detail="Received an empty response from the translation model.")

        return {"translated_text": translated_text}

    except HTTPException:
        # Bug fix: preserve the explicit 500 above instead of rewrapping it.
        raise
    except requests.exceptions.RequestException as e:
        # Handle network-related errors
        raise HTTPException(status_code=502, detail=f"Failed to communicate with the translation AI model: {e}")
    except Exception as e:
        # Handle other potential errors
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred during translation: {e}")
323
+
324
+
325
# --- Model 3: Gemini (The HTML Generator) ---
async def generate_html_from_translated_json(translated_json: dict) -> str:
    """
    Receives a translated JSON object and uses Gemini to generate the final
    structured HTML document, with every word wrapped in a clickable span
    and the translation-dropdown script injected.

    Returns:
        The final HTML string. On failure a small self-contained error page
        is returned instead of raising, so callers can always render the
        result directly.
    """
    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name='gemini-2.0-flash')
        json_string_for_prompt = json.dumps(translated_json, indent=2)

        prompt = f"""
You are an expert system that converts a JSON object containing PRE-TRANSLATED text into a clean, semantic HTML document.

**Your Task:**
1. Analyze the following JSON object. Its text content has already been translated.
2. The core document data is located at the path: `choices[0]['message']['tool_calls'][0]['function']['arguments']`.
3. The value of 'arguments' is a JSON STRING. You must parse this inner string to access the list of document chunks.
4. Using the translated data from the 'text' fields, generate a single, complete HTML5 document. Use appropriate tags like <h1>, <h2>, <p>, and <table>.
5. if json contains "tabular" means make a table for that with some grey border and styling
6. Your final output must ONLY be the raw HTML code. Do not add comments or markdown.

**Translated JSON object to process:**
```json
{json_string_for_prompt}
```
"""

        def do_request():
            # generate_content is blocking, so this whole helper runs in a
            # worker thread via asyncio.to_thread below.
            response = model.generate_content(prompt)

            # Extract raw HTML from a Gemini markdown code block, if present
            match = re.search(r'```html\n(.*?)\n```', response.text, re.DOTALL)
            raw_html = match.group(1).strip() if match else response.text.strip()

            # Wrap each word in clickable spans
            wrapped_html = wrap_words_with_spans(raw_html)

            # Inject the dropdown script and return the final document
            return inject_dropdown_script(wrapped_html)

        return await asyncio.to_thread(do_request)
    except google_exceptions.ResourceExhausted:
        error_message = "The request to the document processor (Gemini) was rejected due to API quota limits. Please wait or upgrade your API plan."
        return f"<html><body><h1>API Quota Error</h1><p>{html.escape(error_message)}</p></body></html>"
    except Exception as e:
        error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
387
+
388
+
389
# --- API Endpoint Orchestrating the Pipeline ---
@app.post("/api/translate_file", response_class=HTMLResponse)
async def translate_document_to_raw_html(target_language: str = Form(...), file: UploadFile = File(...)):
    """
    Processes a document using the final, robust pipeline:
    1. Nemo extracts content to JSON.
    2. Sea-Lion translates the text within the JSON.
    3. Gemini generates the final HTML from the translated JSON.

    Raises:
        HTTPException 400: unsupported upload type.
        HTTPException 502: a downstream model call failed.
        HTTPException 500: any other unexpected failure.
    """
    content_type = file.content_type
    if content_type not in ["application/pdf", "image/png", "image/jpeg"]:
        raise HTTPException(status_code=400, detail="Unsupported file type.")

    try:
        # === STEP 1: Get raw JSON from Nemo (The Parser) ===
        file_content = await file.read()
        file_b64 = base64.b64encode(file_content).decode("utf-8")
        nemo_data = {
            "model": "nvidia/nemoretriever-parse",
            "messages": [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": f"data:{content_type};base64,{file_b64}"}}]}],
            "max_tokens": 2048,
        }
        headers = {'accept': 'application/json', 'Content-Type': 'application/json'}
        model_response = requests.post('http://localhost:8000/v1/chat/completions', headers=headers, data=json.dumps(nemo_data))
        model_response.raise_for_status()
        nemo_response_json = model_response.json()

        # === STEP 2: Get translated JSON from Sea-Lion (The Translator) ===
        # NOTE(review): translate_text is annotated (text: str, ...) but is
        # handed a dict here — confirm the intended payload shape.
        translated_json = await translate_text(nemo_response_json, target_language)

        # === STEP 3: Generate final HTML from Gemini (The HTML Generator) ===
        # generate_html_from_translated_json returns an error page rather
        # than raising, so the result can be returned directly either way
        # (the previous version had a redundant duplicate return branch).
        final_html = await generate_html_from_translated_json(translated_json)

        # === STEP 4: Return the final result to the frontend ===
        return HTMLResponse(content=final_html)

    except HTTPException:
        # Preserve deliberate HTTP errors from helpers instead of
        # rewrapping them as generic 500s.
        raise
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=502, detail=f"Failed to communicate with a downstream AI model: {e}")
    except Exception as e:
        # This will catch any errors, including the ValueError from the Sea-Lion function
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred during processing: {e}")
442
+
443
@app.post("/api/verify_document_keywords")
async def verify_document_keywords(
    file: UploadFile = File(...),
    analysis_type: str = Form("legality"),
    search_context: str = Form("Singapore employment law")
):
    """
    Receives an HTML file and a configuration via form data, then uses the
    agent-to-agent RAG workflow to identify and verify key claims.

    Raises:
        HTTPException 400: not an HTML file, or no extractable text.
        HTTPException 500: the analysis tool reported an error or an
            unexpected failure occurred.
    """
    # Check if the uploaded file is an HTML file
    if file.content_type != "text/html":
        raise HTTPException(status_code=400, detail="Unsupported file type. Please upload a .html file.")

    try:
        # Step 1: Read content from the uploaded file and extract text
        html_content_bytes = await file.read()
        html_content = html_content_bytes.decode('utf-8')
        document_text = extract_text_from_html(html_content)

        if not document_text.strip():
            raise HTTPException(
                status_code=400,
                detail="Could not extract any meaningful text from the provided HTML content."
            )

        # Step 2: Prepare the configuration and call the analysis tool
        config = {
            "analysis_type": analysis_type,
            "search_context": search_context
        }
        analysis_results = await analyze_keywords_with_web_search(document_text, config)

        # Step 3: Handle potential errors from the tool
        if 'error' in analysis_results:
            raise HTTPException(status_code=500, detail=analysis_results['error'])

        # Step 4: Return the successful analysis
        return analysis_results

    except HTTPException:
        # Bug fix: the 400 for empty text (and the tool's 500) were being
        # swallowed by the generic handler below and returned as 500s.
        raise
    except Exception as e:
        # Catch any other unexpected errors during the process
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {str(e)}")
486
+
487
+
488
+
489
@app.post("/upload")
async def upload_file(
    authorization: str = Header(...),
    file: UploadFile = File(...)
):
    """Store an uploaded file in Supabase Storage and record its metadata.

    Verifies the caller's Clerk JWT, writes the file bytes to the
    ``user-documents`` bucket under a per-user path, then inserts a row
    into the ``documents`` table.

    Raises:
        HTTPException 401: missing/invalid bearer token.
        HTTPException 500: the storage upload or metadata insert failed.
    """
    if not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Missing Bearer token")

    token = authorization.split(" ")[1]
    claims = await verify_clerk_jwt(token)

    user_id = claims.get("sub")  # Clerk user ID
    # NOTE(review): the extension is hard-coded to .png regardless of
    # file.content_type — confirm uploads are always PNG or derive the suffix.
    filename = f"{user_id}/{uuid.uuid4()}.png"

    # Bug fix: the storage path, file_url and response message previously
    # contained a literal placeholder instead of the generated `filename`
    # (which was otherwise unused). One shared client now performs both calls.
    async with httpx.AsyncClient() as client:
        # Upload the raw bytes to Supabase Storage
        upload_resp = await client.post(
            f"{SUPABASE_URL}/storage/v1/object/user-documents/{filename}",
            headers={
                "Authorization": f"Bearer {SUPABASE_SERVICE_ROLE_KEY}",
                "Content-Type": file.content_type,
            },
            content=await file.read()
        )

        if upload_resp.status_code != 200:
            raise HTTPException(status_code=500, detail="Failed to upload to Supabase Storage")

        file_url = f"user-documents/{filename}"

        # Insert metadata into the `documents` table
        insert_resp = await client.post(
            f"{SUPABASE_URL}/rest/v1/documents",
            headers={
                "Authorization": f"Bearer {SUPABASE_SERVICE_ROLE_KEY}",
                "apikey": SUPABASE_SERVICE_ROLE_KEY,
                "Content-Type": "application/json",
                "Prefer": "return=representation"
            },
            json={
                "user_id": user_id,
                "filename": filename.split("/")[-1],
                "file_url": file_url
            }
        )

    if insert_resp.status_code >= 300:
        raise HTTPException(status_code=500, detail="Failed to insert document metadata")

    return {"message": f"File uploaded as {filename}"}
558
+
559
+
560
@app.get("/api/documents")
async def get_user_documents(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """List the caller's documents with short-lived signed download URLs.

    Verifies the Clerk JWT, fetches the caller's rows from the Supabase
    ``documents`` table, then attaches a one-hour signed URL for each file
    (``signed_url`` is None when signing fails or the path is empty).

    Raises:
        HTTPException 401: invalid user claims.
        HTTPException 500: the documents query failed.
    """
    token = credentials.credentials
    claims = await verify_clerk_jwt(token)
    user_id = claims.get("sub")
    if not user_id:
        raise HTTPException(status_code=401, detail="Invalid user")

    # One shared client for both the table query and the signing calls
    # (previously a second client was opened needlessly). Debug prints of
    # signed URLs were removed: they leaked access tokens into the logs.
    async with httpx.AsyncClient() as client:
        # Step 1: Get documents from Supabase
        resp = await client.get(
            f"{SUPABASE_URL}/rest/v1/documents?user_id=eq.{user_id}",
            headers={
                "apikey": SUPABASE_SERVICE_ROLE_KEY,
                "Authorization": f"Bearer {SUPABASE_SERVICE_ROLE_KEY}",
                "Accept": "application/json",
            },
        )

        if resp.status_code != 200:
            raise HTTPException(status_code=500, detail="Failed to fetch documents")

        documents = resp.json()

        # Step 2: Get signed URLs for each file
        for doc in documents:
            file_path = doc["file_url"].split("user-documents/", 1)[-1]
            if not file_path:
                doc["signed_url"] = None
                continue

            signed_url_resp = await client.post(
                f"{SUPABASE_URL}/storage/v1/object/sign/user-documents/{file_path}",
                headers={
                    "apikey": SUPABASE_SERVICE_ROLE_KEY,
                    "Authorization": f"Bearer {SUPABASE_SERVICE_ROLE_KEY}",
                },
                json={"expiresIn": 3600},  # 1 hour
            )

            if signed_url_resp.status_code == 200:
                doc["signed_url"] = f"{SUPABASE_URL}/storage/v1{signed_url_resp.json().get('signedURL')}"
            else:
                doc["signed_url"] = None

    return documents
auth/__pycache__/clerk.cpython-310.pyc ADDED
Binary file (1.25 kB). View file
 
auth/clerk.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # auth/clerk.py
2
+ from jose import jwt
3
+ import httpx
4
+ from fastapi import HTTPException
5
+
6
+ CLERK_ISSUER = "https://enabling-terrapin-28.clerk.accounts.dev" # e.g. https://enabling-terrapin-28.clerk.accounts.dev
7
+ CLERK_AUDIENCE = "http://localhost:3000" # Your frontend origin
8
+
9
async def verify_clerk_jwt(token: str) -> dict:
    """Validate a Clerk-issued JWT and return its decoded claims.

    Fetches the issuer's JWKS, selects the key matching the token's ``kid``
    header, and verifies signature, audience and issuer.

    Raises:
        HTTPException 401: the JWKS fetch failed, no matching public key
            was found, or the token failed verification.
    """
    try:
        async with httpx.AsyncClient() as client:
            jwks_url = f"{CLERK_ISSUER}/.well-known/jwks.json"
            resp = await client.get(jwks_url)
            # Bug fix: fail fast on a bad JWKS response instead of raising a
            # confusing KeyError/JSON error below.
            resp.raise_for_status()
            jwks = resp.json()["keys"]

        unverified_header = jwt.get_unverified_header(token)
        kid = unverified_header.get("kid")

        key = next((k for k in jwks if k["kid"] == kid), None)
        if not key:
            raise HTTPException(status_code=401, detail="Public key not found")

        payload = jwt.decode(
            token,
            key,
            algorithms=["RS256"],
            audience=CLERK_AUDIENCE,
            issuer=CLERK_ISSUER
        )
        return payload
    except HTTPException:
        # Keep the specific "Public key not found" detail instead of
        # rewrapping it in the generic message below.
        raise
    except Exception as e:
        raise HTTPException(status_code=401, detail=f"Invalid Clerk JWT: {str(e)}")
requirements.txt ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.9.0
2
+ annotated-types==0.7.0
3
+ anyio==4.9.0
4
+ beautifulsoup4==4.13.4
5
+ cachetools==5.5.2
6
+ certifi==2025.7.14
7
+ cffi==1.17.1
8
+ charset-normalizer==3.4.2
9
+ click==8.2.1
10
+ colorama==0.4.6
11
+ cryptography==45.0.5
12
+ dnspython==2.7.0
13
+ dotenv==0.9.9
14
+ ecdsa==0.19.1
15
+ email_validator==2.2.0
16
+ exceptiongroup==1.3.0
17
+ fastapi==0.116.1
18
+ fastapi-cli==0.0.8
19
+ fastapi-cloud-cli==0.1.4
20
+ filelock==3.13.1
21
+ fsspec==2024.6.1
22
+ google-ai-generativelanguage==0.6.15
23
+ google-api-core==2.25.1
24
+ google-api-python-client==2.177.0
25
+ google-auth==2.40.3
26
+ google-auth-httplib2==0.2.0
27
+ google-generativeai==0.8.5
28
+ googleapis-common-protos==1.70.0
29
+ grpcio==1.74.0
30
+ grpcio-status==1.71.2
31
+ h11==0.16.0
32
+ httpcore==1.0.9
33
+ httplib2==0.22.0
34
+ httptools==0.6.4
35
+ httpx==0.28.1
36
+ huggingface-hub==0.34.3
37
+ idna==3.10
38
+ itsdangerous==2.2.0
39
+ Jinja2==3.1.6
40
+ langdetect==1.0.9
41
+ markdown-it-py==3.0.0
42
+ MarkupSafe==2.1.5
43
+ mdurl==0.1.2
44
+ mpmath==1.3.0
45
+ networkx==3.3
46
+ numpy==2.1.2
47
+ orjson==3.11.0
48
+ packaging==25.0
49
+ pillow==11.0.0
50
+ proto-plus==1.26.1
51
+ protobuf==5.29.5
52
+ psutil==7.0.0
53
+ pyasn1==0.6.1
54
+ pyasn1_modules==0.4.2
55
+ pycparser==2.22
56
+ pydantic==2.11.7
57
+ pydantic-extra-types==2.10.5
58
+ pydantic-settings==2.10.1
59
+ pydantic_core==2.33.2
60
+ Pygments==2.19.2
61
+ PyMuPDF==1.26.3
62
+ pyparsing==3.2.3
63
+ python-dotenv==1.1.1
64
+ python-jose==3.5.0
65
+ python-multipart==0.0.20
66
+ PyYAML==6.0.2
67
+ regex==2025.7.31
68
+ requests==2.32.4
69
+ rich==14.0.0
70
+ rich-toolkit==0.14.8
71
+ rignore==0.6.4
72
+ rsa==4.9.1
73
+ safetensors==0.5.3
74
+ sentry-sdk==2.33.2
75
+ shellingham==1.5.4
76
+ six==1.17.0
77
+ sniffio==1.3.1
78
+ soupsieve==2.7
79
+ starlette==0.47.2
80
+ sympy==1.13.3
81
+ tokenizers==0.21.4
82
+ torch==2.7.1+cu126
83
+ torchaudio==2.7.1+cu126
84
+ torchvision==0.22.1+cu126
85
+ tqdm==4.67.1
86
+ transformers==4.54.1
87
+ typer==0.16.0
88
+ typing-inspection==0.4.1
89
+ typing_extensions==4.12.2
90
+ ujson==5.10.0
91
+ uritemplate==4.2.0
92
+ urllib3==2.5.0
93
+ uvicorn==0.35.0
94
+ watchfiles==1.1.0
95
+ websockets==15.0.1
96
+ langextract
tools/TOOLS_README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Test Summary Tool:
2
+ ```bash
3
+ curl -X POST "http://localhost:8080/api/analyze_html" -F "[email protected];type=text/html"
4
+ ```
5
+
6
+ response:
7
+ ```json
8
+ {"summary":"This payslip shows earnings including base salary, allowances, and overtime pay, with deductions for advances, loans, and CPF, resulting in a net pay of 2363.40 or 4213.40. CPF contributions are also detailed.","earnings":["基 本工资 (Basic Salary): 1800.00","总加班费 (Total Overtime): 368.16","[加班1.5倍] (Overtime 1.5x): 141.60","[加班2.0倍] (Overtime 2.0x): 226.56","住宿补贴 (Housing Allowance): 450.00","特别津贴 (Special Allowance): 100.00","交通津贴 (Transport Allowance): 300.00","雇主公积金 (Employer CPF): 180.00"],"deductions":["第一周预支 (First Week Advance): -300.00","员工贷款 (Employee Loan): -80.00","CDAC: -1.00","员工公积金 (Employee CPF): -191.00","总无薪假 (Total Unpaid Leave): -82.76"],"additional_info":{"gross_pay":["2935.40","4935.40"],"net_pay":["2363.40","4213.40"],"cpf_salary":["2555.40","4555.40"],"total_cpf":["371.00","661.00"],"annual_leave":{"used":"1.00","balance":"48.00"},"medical_leave":{"used":"0.00","balance":"14.00"},"bank_details":{"bank":"华侨银行 (OCBC Bank)","account_number":"151179932"}}}
9
+ ```
10
+
11
+ Test Keyword Tool:
12
+
13
+ ```bash
14
+ curl -X POST "http://localhost:8080/api/verify_document_keywords" -F "[email protected];type=text/html" -F "analysis_type=legality" -F "search_context=Singapore employment law"
15
+
16
+ ```
17
+
18
+ response:
19
+ ```json
20
+ {"analysis_configuration":{"analysis_type":"legality","search_context":"Singapore employment law"},"verification_results":[{"claim":"基本工资 1800.00","summary":"Claim states a base salary of 1800.00. No evidence to verify.","status":"Needs Manual Review"},{"claim":"[ 加班1. 5倍 ] 10. 00小时 x $ 14. 16 = 141. 60","summary":"Claim states overtime pay at 1.5x rate for 10 hours. The calculation needs verification. No evidence to verify the hourly rate or overtime policy.","status":"Needs Manual Review"},{"claim":"[ 加班2. 0倍 ] 12. 00小时 x $ 18. 88 = 226. 56","summary":"Claim states overtime pay at 2.0x rate for 12 hours. The calculation needs verification. No evidence to verify the hourly rate or overtime policy.","status":"Needs Manual Review"},{"claim":"员工公积金 - 191. 00","summary":"Claim states a deduction of 191.00 for employee housing fund. No evidence to verify the legality or accuracy.","status":"Needs Manual Review"},{"claim":"年假 / 已用 / 余额 : 1. 00 / 1. 00 / 48. 00","summary":"Claim states vacation leave information. No evidence to verify the accuracy or legality of the leave policy.","status":"Needs Manual Review"}]}
21
+ ```
tools/__pycache__/tools.cpython-310.pyc ADDED
Binary file (9.67 kB). View file
 
tools/extraction_results.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"extractions": [{"extraction_class": "character", "extraction_text": "Lady Juliet", "char_interval": {"start_pos": 0, "end_pos": 11}, "alignment_status": "match_exact", "extraction_index": 1, "group_index": 0, "description": null, "attributes": {"emotional_state": "longing"}}, {"extraction_class": "emotion", "extraction_text": "heart aching", "char_interval": {"start_pos": 46, "end_pos": 58}, "alignment_status": "match_exact", "extraction_index": 2, "group_index": 1, "description": null, "attributes": {"feeling": "ache"}}, {"extraction_class": "relationship", "extraction_text": "for Romeo", "char_interval": {"start_pos": 59, "end_pos": 68}, "alignment_status": "match_exact", "extraction_index": 3, "group_index": 2, "description": null, "attributes": {"type": "love"}}], "text": "Lady Juliet gazed longingly at the stars, her heart aching for Romeo", "document_id": "doc_211712b3"}
tools/langextract_tool.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""One-shot demo: entity extraction with langextract + Gemini.

Extracts characters, emotions and relationships from a sample sentence,
saves the annotated document to ``extraction_results.jsonl`` and writes an
interactive HTML visualization to ``visualization.html``.
"""

import os
import textwrap

import langextract as lx
from dotenv import load_dotenv

# Step 1: Load environment variables from a .env file
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    # Fail fast with an actionable message; assigning None into os.environ
    # would otherwise raise an opaque TypeError at the next line.
    raise RuntimeError("GEMINI_API_KEY is not set; add it to your .env file.")
os.environ["LANGEXTRACT_API_KEY"] = GEMINI_API_KEY

# 1. Define the prompt and extraction rules
prompt = textwrap.dedent(
    """\
    Extract characters, emotions, and relationships in order of appearance.
    Use exact text for extractions. Do not paraphrase or overlap entities.
    Provide meaningful attributes for each entity to add context."""
)

# 2. Provide a high-quality example to guide the model
examples = [
    lx.data.ExampleData(
        text="ROMEO. But soft! What light through yonder window breaks? It is the east, and Juliet is the sun.",
        extractions=[
            lx.data.Extraction(
                extraction_class="character",
                extraction_text="ROMEO",
                attributes={"emotional_state": "wonder"},
            ),
            lx.data.Extraction(
                extraction_class="emotion",
                extraction_text="But soft!",
                attributes={"feeling": "gentle awe"},
            ),
            lx.data.Extraction(
                extraction_class="relationship",
                extraction_text="Juliet is the sun",
                attributes={"type": "metaphor"},
            ),
        ],
    )
]

# The input text to be processed
input_text = "Lady Juliet gazed longingly at the stars, her heart aching for Romeo"

# Run the extraction
result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash",
)

# Save the results to a JSONL file
lx.io.save_annotated_documents(
    [result], output_name="extraction_results.jsonl", output_dir="."
)

# Generate the visualization from the file
html_content = lx.visualize("extraction_results.jsonl")
with open("visualization.html", "w", encoding="utf-8") as f:
    f.write(html_content)
tools/tools.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# ./tools/tools.py
#
# Shared tool functions: HTML text extraction, Gemini-based document
# analysis, and Google Programmable Search (RAG) helpers.

import os
import json
import logging
import asyncio
import itertools
from functools import partial
from concurrent.futures import ThreadPoolExecutor

import google.generativeai as genai
from google.api_core import exceptions as google_exceptions
from googleapiclient.discovery import build
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Step 1: Load environment variables from a .env file
load_dotenv()

# Configure a logger for the tool
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Step 2: Configure the Gemini API key right after loading it so every
# genai call made by this module is authenticated.
try:
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("GEMINI_API_KEY not found in environment variables.")
    genai.configure(api_key=api_key)
except (ValueError, TypeError) as e:
    # Emit through the module logger (configured just above) instead of a
    # bare print() so the warning lands in the same stream as all other logs
    # when the server starts without a key.
    logger.warning(f"WARNING: Gemini API not configured. Tool will fail. Reason: {e}")
34
+
35
+
36
def extract_text_from_html(html_content: str) -> str:
    """Return the human-readable text contained in *html_content*.

    Script and style elements are stripped before extraction so only
    visible body text remains; an empty or falsy input yields "".
    """
    if not html_content:
        return ""

    parsed = BeautifulSoup(html_content, "html.parser")

    # Remove non-visible content so it never leaks into the extracted text.
    for hidden in parsed(["script", "style"]):
        hidden.decompose()

    return parsed.get_text(separator=" ", strip=True)
50
+
51
+
52
async def generate_document_insights(document_text: str) -> dict:
    """Summarize payslip text and extract key financial figures via Gemini.

    Args:
        document_text: Plain text previously extracted from a payslip/document.

    Returns:
        On success, a dict with "summary" (str), "earnings" (list[str]) and
        "deductions" (list[str]) parsed from the model's JSON reply.
        On quota exhaustion or other failure, a dict with an "error" key.
        If the model reply is not valid JSON, a fallback dict with "summary"
        and "key_points" (the raw model text) is returned instead.
    """
    try:
        if not document_text.strip():
            return {
                "error": "Could not extract any meaningful text from the provided content."
            }

        # gemini-2.5-flash: fast, inexpensive model suitable for extraction.
        model = genai.GenerativeModel(model_name="gemini-2.5-flash")

        prompt = f"""
        You are an expert financial analyst who specializes in interpreting payslips and financial documents.
        Based on the text below, which was extracted from a payslip, perform two tasks:
        1. **Summarize**: Create a concise, one-sentence summary of the payslip, focusing on the final net pay.
        2. **Extract Key Figures**: Identify and list the most important financial figures as bullet points. Categorize them into "Earnings," and "Deductions."

        **Document Text:**
        ---
        {document_text}
        ---

        Please format your response as a valid JSON object with three keys: "summary" (a string), "earnings" (an array of strings), and "deductions" (an array of strings).

        Example Format:
        {{
          "summary": "This payslip shows a net pay of [Net Pay Amount] after calculating total earnings and deductions.",
          "earnings": [
            "Basic Salary: 1800.00",
            "Total Overtime: 368.16",
            "Housing Allowance: 450.00"
          ],
          "deductions": [
            "Advance (Week 1): -300.00",
            "Employee Loan: -80.00",
            "Employee CPF: -191.00"
          ]
        }}
        """

        response = await model.generate_content_async(prompt)

        # BUGFIX: strip Markdown code fences only at the *edges* of the reply.
        # The previous blanket str.replace("```", "") also deleted backticks
        # that happened to appear inside the JSON payload, corrupting it.
        cleaned_response_text = response.text.strip()
        if cleaned_response_text.startswith("```"):
            cleaned_response_text = cleaned_response_text.removeprefix("```json")
            cleaned_response_text = cleaned_response_text.removeprefix("```").lstrip()
        cleaned_response_text = cleaned_response_text.removesuffix("```").strip()

        insights = json.loads(cleaned_response_text)
        return insights

    except google_exceptions.ResourceExhausted as e:
        return {
            "error": f"Gemini API quota exceeded. Please try again later. Details: {e}"
        }
    except json.JSONDecodeError:
        # Model replied, but not with valid JSON: surface the raw text so the
        # caller still gets something actionable.
        return {
            "summary": "Could not parse the AI's response.",
            "key_points": [response.text],
        }
    except Exception as e:
        # Also catches the case where genai was never configured (missing key).
        return {
            "error": f"An unexpected error occurred during document analysis: {str(e)}"
        }
117
+
118
+
119
def _execute_single_google_search(query: str, max_results: int) -> list[dict]:
    """(Internal Helper) Performs a single synchronous web search using Google.

    Returns a list of {"title", "href", "snippet"} dicts; on any failure the
    error is logged and an empty list is returned (best-effort semantics).
    """
    logger.info(f"Executing web search for query: '{query}'...")
    try:
        # ADAPTATION: Fetch keys directly from environment variables.
        google_api_key = os.getenv("GEMINI_API_KEY")
        engine_id = os.getenv("GOOGLE_CSE_ID")
        if not google_api_key or not engine_id:
            raise ValueError(
                "GEMINI_API_KEY and GOOGLE_CSE_ID must be set in the environment."
            )

        client = build("customsearch", "v1", developerKey=google_api_key)
        raw = client.cse().list(q=query, cx=engine_id, num=max_results).execute()

        # MODIFICATION: Added 'snippet' for better RAG context.
        hits: list[dict] = []
        for item in raw.get("items", []):
            link = item.get("link")
            if not link:
                continue  # skip results without a usable URL
            hits.append(
                {
                    "title": item.get("title", "Untitled"),
                    "href": link,
                    "snippet": item.get("snippet", "No snippet available."),
                }
            )

        logger.info(f"Found {len(hits)} web results for query: '{query}'")
        return hits
    except Exception as e:
        logger.error(f"An error occurred during web search for '{query}': {e}")
        return []
154
+
155
+
156
async def perform_searches_and_get_hits(
    queries: list[str], executor: ThreadPoolExecutor, max_results_per_query: int = 3
) -> list[dict]:
    """Asynchronously runs multiple Google searches and returns a de-duplicated list of hits."""
    if not queries:
        return []
    logger.info(f"\n--- Starting concurrent web search for {len(queries)} queries ---")
    event_loop = asyncio.get_running_loop()

    # ADAPTATION: Removed settings dependency.
    # Fan the blocking searches out onto the thread pool and await them all.
    futures = [
        event_loop.run_in_executor(
            executor,
            partial(_execute_single_google_search, query, max_results_per_query),
        )
        for query in queries
    ]
    per_query_results = await asyncio.gather(*futures)

    # De-duplicate by URL; a later hit for the same URL replaces the earlier one.
    deduped: dict[str, dict] = {}
    for hit in itertools.chain.from_iterable(per_query_results):
        deduped[hit["href"]] = hit

    final_hits = list(deduped.values())
    logger.info(
        f"--- Web search complete. Found {len(final_hits)} unique items in total. ---"
    )
    return final_hits
182
+
183
+
184
# --- MODIFIED: Keyword Analysis Tool now uses the full search pipeline ---


async def analyze_keywords_with_web_search(document_text: str, config: dict) -> dict:
    """
    Analyzes and verifies keywords using a two-agent RAG process with
    efficient batching for verification to avoid rate limits.

    Args:
        document_text: Text of the document whose claims should be verified.
        config: Options dict; recognized keys are "analysis_type"
            (default "accuracy") and "search_context" (default "public records").

    Returns:
        On success, {"analysis_configuration": config, "verification_results":
        [...]} where each result has "claim", "summary" and "status" keys.
        When no keywords are found, a {"message", "verification_results"} dict.
        On failure, a dict with an "error" key (plus "raw_response" when an
        agent returned malformed JSON).
    """
    try:
        model = genai.GenerativeModel(model_name="gemini-2.0-flash")
        analysis_type = config.get("analysis_type", "accuracy")
        search_context = config.get("search_context", "public records")

        # --- Agent 1: Keyword/Claim Extraction (1 API Call) ---
        logger.info("Agent 1: Extracting keywords from document...")
        keyword_extraction_prompt = f"""
        You are an expert analyst specializing in document verification. Based on the document text below,
        identify and extract up to 5 critical keywords, figures, or claims that must be verified for {analysis_type}
        within the context of "{search_context}".

        Focus on terms that are verifiable against external sources.
        Return your findings as a valid JSON array of strings.

        Document Text:
        ---
        {document_text}
        ---
        """
        response_agent1 = await model.generate_content_async(keyword_extraction_prompt)
        cleaned_agent1_response = (
            response_agent1.text.strip()
            .replace("```json", "")
            .replace("```", "")
            .strip()
        )

        try:
            keywords_to_verify = json.loads(cleaned_agent1_response)
        except json.JSONDecodeError:
            return {
                "error": "Agent 1 (Keyword Extractor) failed to return valid JSON.",
                "raw_response": cleaned_agent1_response,
            }

        # ROBUSTNESS: the model occasionally returns an object instead of the
        # requested array; iterating it would silently "verify" its keys.
        if not isinstance(keywords_to_verify, list):
            return {
                "error": "Agent 1 (Keyword Extractor) failed to return valid JSON.",
                "raw_response": cleaned_agent1_response,
            }

        if not keywords_to_verify:
            return {
                "message": "No keywords were identified for verification.",
                "verification_results": [],
            }

        logger.info(
            f"Agent 1 found {len(keywords_to_verify)} keywords: {keywords_to_verify}"
        )

        # --- (Optional) Polite Delay ---
        # A small pause between the two main API calls. Not strictly needed for
        # rate limiting anymore, but can be good practice.
        await asyncio.sleep(2)

        # --- Live Web Search (No API Calls to Gemini) ---
        dork_queries = [
            f'"{keyword}" AND "{search_context}"' for keyword in keywords_to_verify
        ]
        with ThreadPoolExecutor() as executor:
            all_search_hits = await perform_searches_and_get_hits(
                dork_queries, executor
            )

        # --- Agent 2: Batch Verification (1 API Call for all keywords) ---
        logger.info("Agent 2: Starting batch verification for all keywords...")

        # Step 1: Prepare the evidence for each claim (match by substring in
        # title or snippet; cap at 3 snippets per claim to bound prompt size).
        verification_items_for_prompt = []
        for keyword in keywords_to_verify:
            relevant_hits = [
                hit
                for hit in all_search_hits
                if keyword.lower() in hit.get("title", "").lower()
                or keyword.lower() in hit.get("snippet", "").lower()
            ]
            web_snippets = (
                "\n".join([f"- {hit['snippet']}" for hit in relevant_hits[:3]])
                if relevant_hits
                else "No specific information found on the web."
            )

            # Create a formatted block for each item to be verified
            item_block = f'Claim: "{keyword}"\n' f"Evidence:\n{web_snippets}\n" f"---"
            verification_items_for_prompt.append(item_block)

        # BUGFIX: the claim blocks were previously concatenated with ''.join,
        # which ran each "---" separator straight into the next 'Claim:' line
        # (e.g. '---Claim: ...'). Join with newlines so every claim block is
        # clearly delimited in the prompt.
        claims_section = "\n".join(verification_items_for_prompt)

        # Step 2: Create a single, powerful batch prompt
        batch_verification_prompt = f"""
        You are a verification agent. For EACH of the following claims, assess its {analysis_type} based ONLY on the provided evidence.
        Your response MUST be a valid JSON array, where each object has three keys: "claim", "summary", and "status".
        The status must be one of: "Verified", "Contradicted", or "Needs Manual Review".

        Here are the claims to verify:

        {claims_section}

        Provide only the JSON array as your final answer. Do not include markdown backticks.
        """

        # Step 3: Make a single API call for all verifications
        response_agent2 = await model.generate_content_async(batch_verification_prompt)
        cleaned_agent2_response = (
            response_agent2.text.strip()
            .replace("```json", "")
            .replace("```", "")
            .strip()
        )

        # Step 4: Parse the batch response
        try:
            verification_results = json.loads(cleaned_agent2_response)
        except json.JSONDecodeError:
            logger.error(
                f"Agent 2 (Verifier) failed to return valid JSON in batch mode. Raw response: {cleaned_agent2_response}"
            )
            return {
                "error": "Agent 2 (Verifier) failed to return valid JSON in batch mode.",
                "raw_response": cleaned_agent2_response,
            }

        logger.info("Agent 2: Batch verification complete.")
        return {
            "analysis_configuration": config,
            "verification_results": verification_results,
        }

    except Exception as e:
        logger.error(
            f"An unexpected error occurred in the keyword analysis tool: {str(e)}",
            exc_info=True,
        )
        return {
            "error": f"An unexpected error occurred in the keyword analysis tool: {str(e)}"
        }
tools/visualization.html ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<style>
/* Generated by langextract's visualizer: styles for the animated
   entity-highlight player (highlights, tooltips, controls, legend). */

/* Base highlight span; relative positioning anchors the hover tooltip. */
.lx-highlight { position: relative; border-radius:3px; padding:1px 2px;}
/* Tooltip: hidden by default, faded in by the :hover rule below. */
.lx-highlight .lx-tooltip {
visibility: hidden;
opacity: 0;
transition: opacity 0.2s ease-in-out;
background: #333;
color: #fff;
text-align: left;
border-radius: 4px;
padding: 6px 8px;
position: absolute;
z-index: 1000;
bottom: 125%;
left: 50%;
transform: translateX(-50%);
font-size: 12px;
max-width: 240px;
white-space: normal;
box-shadow: 0 2px 6px rgba(0,0,0,0.3);
}
.lx-highlight:hover .lx-tooltip { visibility: visible; opacity:1; }
/* Outer wrapper for the whole widget. */
.lx-animated-wrapper { max-width: 100%; font-family: Arial, sans-serif; }
/* Playback-control panel (buttons + slider + status line). */
.lx-controls {
background: #fafafa; border: 1px solid #90caf9; border-radius: 8px;
padding: 12px; margin-bottom: 16px;
}
.lx-button-row {
display: flex; justify-content: center; gap: 8px; margin-bottom: 12px;
}
.lx-control-btn {
background: #4285f4; color: white; border: none; border-radius: 4px;
padding: 8px 16px; cursor: pointer; font-size: 13px; font-weight: 500;
transition: background-color 0.2s;
}
.lx-control-btn:hover { background: #3367d6; }
.lx-progress-container {
margin-bottom: 8px;
}
/* Custom-styled range input; thumb styled per engine below. */
.lx-progress-slider {
width: 100%; margin: 0; appearance: none; height: 6px;
background: #ddd; border-radius: 3px; outline: none;
}
.lx-progress-slider::-webkit-slider-thumb {
appearance: none; width: 18px; height: 18px; background: #4285f4;
border-radius: 50%; cursor: pointer;
}
.lx-progress-slider::-moz-range-thumb {
width: 18px; height: 18px; background: #4285f4; border-radius: 50%;
cursor: pointer; border: none;
}
.lx-status-text {
text-align: center; font-size: 12px; color: #666; margin-top: 4px;
}
/* Scrollable source-text pane; pre-wrap preserves original whitespace. */
.lx-text-window {
font-family: monospace; white-space: pre-wrap; border: 1px solid #90caf9;
padding: 12px; max-height: 260px; overflow-y: auto; margin-bottom: 12px;
line-height: 1.6;
}
/* Panel showing the current extraction's class/attributes and the legend. */
.lx-attributes-panel {
background: #fafafa; border: 1px solid #90caf9; border-radius: 6px;
padding: 8px 10px; margin-top: 8px; font-size: 13px;
}
/* Applied by the script to the extraction currently in focus. */
.lx-current-highlight {
border-bottom: 4px solid #ff4444;
font-weight: bold;
animation: lx-pulse 1s ease-in-out;
}
@keyframes lx-pulse {
0% { text-decoration-color: #ff4444; }
50% { text-decoration-color: #ff0000; }
100% { text-decoration-color: #ff4444; }
}
.lx-legend {
font-size: 12px; margin-bottom: 8px;
padding-bottom: 8px; border-bottom: 1px solid #e0e0e0;
}
/* Colored chips in the legend, one per extraction class. */
.lx-label {
display: inline-block;
padding: 2px 4px;
border-radius: 3px;
margin-right: 4px;
color: #000;
}
.lx-attr-key {
font-weight: 600;
color: #1565c0;
letter-spacing: 0.3px;
}
.lx-attr-value {
font-weight: 400;
opacity: 0.85;
letter-spacing: 0.2px;
}

/* Add optimizations with larger fonts and better readability for GIFs */
.lx-gif-optimized .lx-text-window { font-size: 16px; line-height: 1.8; }
.lx-gif-optimized .lx-attributes-panel { font-size: 15px; }
.lx-gif-optimized .lx-current-highlight { text-decoration-thickness: 4px; }
</style>
101
<div class="lx-animated-wrapper lx-gif-optimized">
<!-- Legend plus per-extraction details; #attributesContainer is filled by the script. -->
<div class="lx-attributes-panel">
<div class="lx-legend">Highlights Legend: <span class="lx-label" style="background-color:#D2E3FC;">character</span> <span class="lx-label" style="background-color:#C8E6C9;">emotion</span> <span class="lx-label" style="background-color:#FEF0C3;">relationship</span></div>
<div id="attributesContainer"></div>
</div>
<!-- Source text with one highlight span per extraction; data-idx links each
     span to the script's extractions array. NOTE: this pane uses
     white-space: pre-wrap, so its inner whitespace must not be reformatted. -->
<div class="lx-text-window" id="textWindow">
<span class="lx-highlight lx-current-highlight" data-idx="0" style="background-color:#D2E3FC;">Lady Juliet</span> gazed longingly at the stars, her <span class="lx-highlight" data-idx="1" style="background-color:#C8E6C9;">heart aching</span> <span class="lx-highlight" data-idx="2" style="background-color:#FEF0C3;">for Romeo</span>
</div>
<!-- Playback controls; onclick handlers are exported on window by the script. -->
<div class="lx-controls">
<div class="lx-button-row">
<button class="lx-control-btn" onclick="playPause()">▶️ Play</button>
<button class="lx-control-btn" onclick="prevExtraction()">⏮ Previous</button>
<button class="lx-control-btn" onclick="nextExtraction()">⏭ Next</button>
</div>
<!-- Slider max is extractions.length - 1 (3 extractions -> max=2). -->
<div class="lx-progress-container">
<input type="range" id="progressSlider" class="lx-progress-slider"
min="0" max="2" value="0"
onchange="jumpToExtraction(this.value)">
</div>
<div class="lx-status-text">
Entity <span id="entityInfo">1/3</span> |
Pos <span id="posInfo">[0-11]</span>
</div>
</div>
</div>
126
+
127
+ <script>
128
+ (function() {
129
+ const extractions = [{"index": 0, "class": "character", "text": "Lady Juliet", "color": "#D2E3FC", "startPos": 0, "endPos": 11, "beforeText": "", "extractionText": "Lady Juliet", "afterText": " gazed longingly at the stars, her heart aching for Romeo", "attributesHtml": "<div><strong>class:</strong> character</div><div><strong>attributes:</strong> {<span class=\"lx-attr-key\">emotional_state</span>: <span class=\"lx-attr-value\">longing</span>}</div>"}, {"index": 1, "class": "emotion", "text": "heart aching", "color": "#C8E6C9", "startPos": 46, "endPos": 58, "beforeText": "Lady Juliet gazed longingly at the stars, her ", "extractionText": "heart aching", "afterText": " for Romeo", "attributesHtml": "<div><strong>class:</strong> emotion</div><div><strong>attributes:</strong> {<span class=\"lx-attr-key\">feeling</span>: <span class=\"lx-attr-value\">ache</span>}</div>"}, {"index": 2, "class": "relationship", "text": "for Romeo", "color": "#FEF0C3", "startPos": 59, "endPos": 68, "beforeText": "Lady Juliet gazed longingly at the stars, her heart aching ", "extractionText": "for Romeo", "afterText": "", "attributesHtml": "<div><strong>class:</strong> relationship</div><div><strong>attributes:</strong> {<span class=\"lx-attr-key\">type</span>: <span class=\"lx-attr-value\">love</span>}</div>"}];
130
+ let currentIndex = 0;
131
+ let isPlaying = false;
132
+ let animationInterval = null;
133
+ let animationSpeed = 1.0;
134
+
135
+ function updateDisplay() {
136
+ const extraction = extractions[currentIndex];
137
+ if (!extraction) return;
138
+
139
+ document.getElementById('attributesContainer').innerHTML = extraction.attributesHtml;
140
+ document.getElementById('entityInfo').textContent = (currentIndex + 1) + '/' + extractions.length;
141
+ document.getElementById('posInfo').textContent = '[' + extraction.startPos + '-' + extraction.endPos + ']';
142
+ document.getElementById('progressSlider').value = currentIndex;
143
+
144
+ const playBtn = document.querySelector('.lx-control-btn');
145
+ if (playBtn) playBtn.textContent = isPlaying ? '⏸ Pause' : '▶️ Play';
146
+
147
+ const prevHighlight = document.querySelector('.lx-text-window .lx-current-highlight');
148
+ if (prevHighlight) prevHighlight.classList.remove('lx-current-highlight');
149
+ const currentSpan = document.querySelector('.lx-text-window span[data-idx="' + currentIndex + '"]');
150
+ if (currentSpan) {
151
+ currentSpan.classList.add('lx-current-highlight');
152
+ currentSpan.scrollIntoView({block: 'center', behavior: 'smooth'});
153
+ }
154
+ }
155
+
156
+ function nextExtraction() {
157
+ currentIndex = (currentIndex + 1) % extractions.length;
158
+ updateDisplay();
159
+ }
160
+
161
+ function prevExtraction() {
162
+ currentIndex = (currentIndex - 1 + extractions.length) % extractions.length;
163
+ updateDisplay();
164
+ }
165
+
166
+ function jumpToExtraction(index) {
167
+ currentIndex = parseInt(index);
168
+ updateDisplay();
169
+ }
170
+
171
+ function playPause() {
172
+ if (isPlaying) {
173
+ clearInterval(animationInterval);
174
+ isPlaying = false;
175
+ } else {
176
+ animationInterval = setInterval(nextExtraction, animationSpeed * 1000);
177
+ isPlaying = true;
178
+ }
179
+ updateDisplay();
180
+ }
181
+
182
+ window.playPause = playPause;
183
+ window.nextExtraction = nextExtraction;
184
+ window.prevExtraction = prevExtraction;
185
+ window.jumpToExtraction = jumpToExtraction;
186
+
187
+ updateDisplay();
188
+ })();
189
+ </script>