# ./tools/tools.py
import asyncio
import json
import logging
import os
import re
import textwrap

import httpx
import langextract as lx
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import google.generativeai as genai

# Step 1: Load environment variables and configure API keys
load_dotenv()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise ValueError("GEMINI_API_KEY not found in environment variables.")
    # langextract reads its key from LANGEXTRACT_API_KEY, so mirror the Gemini key.
    os.environ["LANGEXTRACT_API_KEY"] = api_key
    genai.configure(api_key=api_key)
except ValueError as e:
    # Import must not fail when the key is missing; the tools fail later instead.
    logger.warning("API not configured. Tool will fail. Reason: %s", e)


# The template is dedented once, *before* the OCR text is inserted. Dedenting
# after interpolation lets the (arbitrary) indentation of the OCR text decide
# how much whitespace is stripped from the instructions.
_PRE_CLEAN_PROMPT_TEMPLATE = textwrap.dedent(
    """
    The following text is from a messy OCR process. It contains extra spaces,
    incorrect line breaks, and jumbled words. Your task is to clean and reformat
    it into a single, coherent block of text that reads like a proper document.
    Do not summarize or change the content. Just fix the formatting and structure.
    Return ONLY the cleaned text, with no explanations.

    **Messy Text:**
    ---
    {messy_text}
    ---
    """
)


def extract_text_from_html(html_content: str) -> str:
    """
    Parses an HTML string and extracts its human-readable text.

    ``<script>`` and ``<style>`` elements are removed before extraction. The
    remaining text fragments are stripped and joined with single spaces.

    Args:
        html_content: Raw HTML markup. May be empty or ``None``.

    Returns:
        The extracted text, or ``""`` when ``html_content`` is falsy.
    """
    if not html_content:
        return ""

    soup = BeautifulSoup(html_content, "html.parser")
    for script_or_style in soup(["script", "style"]):
        script_or_style.decompose()

    return soup.get_text(separator=" ", strip=True)


async def _pre_clean_text_with_gemini(messy_text: str) -> str:
    """
    Takes messy OCR text and uses Gemini to clean it into a coherent document.

    Args:
        messy_text: Raw OCR output.

    Returns:
        The cleaned text returned by Gemini with surrounding whitespace
        stripped. The input is returned unchanged when it is blank or when
        the Gemini call fails for any reason (best-effort step).
    """
    # Nothing to clean: skip the API round-trip entirely.
    if not messy_text or not messy_text.strip():
        return messy_text

    model = genai.GenerativeModel(model_name="gemini-2.5-flash")
    prompt = _PRE_CLEAN_PROMPT_TEMPLATE.format(messy_text=messy_text)

    try:
        response = await model.generate_content_async(prompt)
        return response.text.strip()
    except Exception as e:
        # Deliberately broad: cleaning is optional, so any failure falls back
        # to the uncleaned text instead of aborting the caller.
        logger.error("Error during text pre-cleaning: %s", e)
        return messy_text


async def _translate_text_to_english_with_sealion(text: str) -> str:
    """
    Translates the given text to English using the Sea-Lion model.

    Args:
        text: Text to translate.

    Returns:
        The translated text with a leading and/or trailing double quote
        removed. The original ``text`` is returned unchanged when it is
        blank, when ``SEALION_API_KEY`` is not set, when the HTTP request
        fails (transport error or non-2xx status), or when the response
        cannot be parsed.
    """
    # Nothing to translate: avoid a pointless network call.
    if not text or not text.strip():
        return text

    url = "https://api.sea-lion.ai/v1/chat/completions"
    api_key = os.getenv("SEALION_API_KEY")
    if not api_key:
        logger.warning("SEALION_API_KEY not found. Skipping translation.")
        return text

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    prompt = (
        "Translate the following text to English. Return ONLY the translated "
        "text, without any additional explanations, formatting, or quotation "
        f'marks:\n\n"{text}"'
    )
    payload = {
        "max_completion_tokens": 4096,
        "messages": [{"role": "user", "content": prompt}],
        "model": "aisingapore/Gemma-SEA-LION-v3-9B-IT",
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                url, headers=headers, json=payload, timeout=60.0
            )
            response.raise_for_status()
        except httpx.HTTPError as e:
            # httpx.HTTPError is the common base of RequestError (transport
            # failures) and HTTPStatusError (raised by raise_for_status on
            # 4xx/5xx). Catching only RequestError lets bad statuses escape.
            logger.error(f"Translation request to Sea-Lion failed: {e}")
            return text

        try:
            response_json = response.json()
            translated_text = response_json["choices"][0]["message"][
                "content"
            ].strip()
        except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
            # ValueError covers a non-JSON body; TypeError/AttributeError cover
            # a payload with the wrong shape or a null "content" field.
            logger.error(f"Could not parse Sea-Lion translation response: {e}")
            return text

    # The prompt wraps the input in double quotes, which the model may echo.
    return re.sub(r'^"|"$', "", translated_text)


# NOTE(review): the original definition of `_generate_html_summary` starts at
# this point. Only a truncated fragment of it was visible during review, so it
# is not reproduced in this rewrite and must be restored from the original file.