Spaces:

adil9858
/

DOCSUM

Sleeping

App Files Files Community

adil9858 committed on May 5, 2025

Commit

0b887a8

verified ·

1 Parent(s): 46d9119

Create app.py

Browse files

Files changed (1) hide show

app.py +182 -0

app.py ADDED Viewed

	@@ -0,0 +1,182 @@

+import gradio as gr
+from openai import OpenAI
+import base64
+from PIL import Image
+import io
+import fitz  # PyMuPDF
+import tempfile
+import os
+# --- HELPER FUNCTIONS ---
+def convert_pdf_to_images(pdf_file):
+    """Render every page of a PDF into a PIL image.
+    Args:
+        pdf_file: Raw bytes of the PDF document.
+    Returns:
+        A list of RGB PIL images, one per page, in page order.
+    Raises:
+        gr.Error: If the PDF cannot be opened or a page cannot be rendered.
+    """
+    images = []
+    try:
+        # Open directly from the in-memory bytes. No temporary file is written,
+        # so nothing is left on disk when opening or rendering fails, and the
+        # context manager closes the document on every exit path.
+        with fitz.open(stream=pdf_file, filetype="pdf") as pdf_document:
+            for page in pdf_document:
+                # alpha=False yields exactly 3 bytes per pixel, which is what
+                # the "RGB" raw decoder below expects.
+                pix = page.get_pixmap(alpha=False)
+                images.append(
+                    Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
+                )
+    except Exception as e:
+        # Chain the cause so the original traceback stays visible in the logs.
+        raise gr.Error(f"Error converting PDF: {e}") from e
+    return images
+def image_to_base64(image):
+    """Encode a PIL image as a base64 string of its PNG representation.
+    Args:
+        image: The PIL image to encode.
+    Returns:
+        The PNG bytes of the image, base64-encoded and decoded to ``str``.
+    """
+    # PNG cannot store every Pillow mode (e.g. CMYK or YCbCr, both of which
+    # occur in JPEG uploads); saving such an image raises, so normalise
+    # anything PNG cannot hold to RGB first.
+    if image.mode not in ("1", "L", "LA", "I", "I;16", "I;16B", "P", "RGB", "RGBA"):
+        image = image.convert("RGB")
+    with io.BytesIO() as buffer:
+        image.save(buffer, format="PNG")
+        return base64.b64encode(buffer.getvalue()).decode("utf-8")
+def generate_summary(extracted_texts, api_key):
+    """Request one consolidated summary covering all extracted page contents.
+    Args:
+        extracted_texts: The per-page extraction results, interpolated into
+            the summary prompt as-is.
+        api_key: OpenRouter API key used to authenticate the request.
+    Returns:
+        The message content of the first choice in the model's response.
+    Raises:
+        gr.Error: If building the client or the completion request fails.
+    """
+    try:
+        prompt = f"""
+        You are an expert document analyst. Below are the extracted contents from multiple pages of a document.
+        Please provide a comprehensive, detailed summary that:
+        1. Organizes all key information logically
+        2. Identifies relationships between data points
+        3. Highlights important figures, dates, names
+        4. Presents the information in a clear, structured format
+        Extracted contents from pages:
+        {extracted_texts}
+        Comprehensive Summary:
+        """
+        # The conversation is a fixed system persona plus the prompt above.
+        system_message = {"role": "system", "content": "You are Dalton, an expert in analyzing and summarizing document contents."}
+        user_message = {"role": "user", "content": prompt}
+        openrouter = OpenAI(api_key=api_key, base_url="https://openrouter.ai/api/v1")
+        completion = openrouter.chat.completions.create(
+            messages=[system_message, user_message],
+            model="opengvlab/internvl3-14b:free",
+            max_tokens=2048,
+        )
+        first_choice = completion.choices[0]
+        return first_choice.message.content
+    except Exception as e:
+        raise gr.Error(f"Error generating summary: {e}")
+def analyze_document(api_key, user_prompt, uploaded_file):
+    """Analyze an uploaded PDF or image page by page with a vision model.
+    Every page (or the single image) is sent to the model together with
+    ``user_prompt``. When more than one page was processed, a combined
+    summary produced by ``generate_summary`` is appended to the output.
+    Args:
+        api_key: OpenRouter API key.
+        user_prompt: Instruction sent alongside every page image.
+        uploaded_file: Upload from the Gradio file component; either a path
+            string or an object whose ``name`` attribute is the path.
+    Returns:
+        The per-page results joined into one string, followed by the
+        comprehensive summary when several pages were analyzed.
+    Raises:
+        gr.Error: On missing inputs, an unreadable or empty document, or a
+            failed model request.
+    """
+    if not api_key:
+        raise gr.Error("Please enter your OpenRouter API key")
+    if uploaded_file is None:
+        raise gr.Error("Please upload a document")
+    # Depending on the Gradio version/configuration the upload arrives as a
+    # plain path or as a wrapper exposing the path via `.name`; accept both.
+    file_path = getattr(uploaded_file, "name", uploaded_file)
+    file_ext = os.path.splitext(file_path)[1].lower()
+    # Handle PDF or image
+    if file_ext == '.pdf':
+        with open(file_path, "rb") as f:
+            images_to_analyze = convert_pdf_to_images(f.read())
+    else:
+        try:
+            # Image.open is lazy and keeps the file open. Copy the pixels
+            # while the file is open so the handle is released immediately
+            # and the in-memory image stays usable afterwards.
+            with Image.open(file_path) as opened:
+                images_to_analyze = [opened.copy()]
+        except OSError as e:
+            # Covers missing files as well as unidentified/truncated images.
+            raise gr.Error(f"Error opening image: {e}") from e
+    if not images_to_analyze:
+        raise gr.Error("The document contains no pages to analyze")
+    # The client is loop-invariant: build it once instead of opening a new
+    # connection pool for every page.
+    client = OpenAI(
+        base_url="https://openrouter.ai/api/v1",
+        api_key=api_key
+    )
+    # Process each image
+    all_results = []
+    extracted_texts = []
+    for idx, image in enumerate(images_to_analyze, 1):
+        try:
+            image_base64 = image_to_base64(image)
+            response = client.chat.completions.create(
+                model="opengvlab/internvl3-14b:free",
+                messages=[
+                    {"role": "system", "content": "You are Dalton, an expert in understanding images that can analyze images and provide detailed descriptions."},
+                    {"role": "user", "content": [
+                        {"type": "text", "text": user_prompt},
+                        {"type": "image_url", "image_url": {
+                            "url": f"data:image/png;base64,{image_base64}"
+                        }}
+                    ]}
+                ],
+                max_tokens=1024
+            )
+            result = response.choices[0].message.content
+        except Exception as e:
+            # Chain the cause so the original traceback stays in the logs.
+            raise gr.Error(f"Error analyzing page {idx}: {e}") from e
+        extracted_texts.append(f"=== Page {idx} ===\n{result}\n")
+        all_results.append(f"📄 Page {idx} Result:\n{result}\n---\n")
+    # Generate summary if multiple pages
+    final_output = "\n".join(all_results)
+    if len(extracted_texts) > 1:
+        summary = generate_summary("\n".join(extracted_texts), api_key)
+        final_output += f"\n📝 Comprehensive Summary:\n{summary}"
+    return final_output
+# --- GRADIO INTERFACE ---
+with gr.Blocks(theme=gr.themes.Soft(), title="DocSum - Document Summarizer") as demo:
+    # Page header: application name and attribution line.
+    gr.Markdown(value="# 🧾 DocSum")
+    gr.Markdown(value="Document Summarizer Powered by VLM • Developed by [Koshur AI](https://koshurai.com)")
+    # The API key and the extraction instruction sit side by side in one row.
+    with gr.Row():
+        api_key = gr.Textbox(
+            type="password",
+            label="🔑 OpenRouter API Key",
+            placeholder="Enter your OpenRouter API key",
+        )
+        user_prompt = gr.Textbox(
+            value="Extract all content structurally",
+            label="📝 Enter Your Prompt",
+            placeholder="What would you like to extract?",
+        )
+    # Only PDFs and common raster image formats are offered for upload.
+    uploaded_file = gr.File(
+        file_types=[".pdf", ".jpg", ".jpeg", ".png"],
+        label="Upload Document (PDF/Image)",
+    )
+    submit_btn = gr.Button(value="🔍 Analyze Document", variant="primary")
+    # Read-only box that receives the text returned by analyze_document.
+    output = gr.Textbox(
+        lines=20,
+        max_lines=50,
+        interactive=False,
+        label="Analysis Results",
+    )
+    # Clicking the button runs the analysis with the three inputs above.
+    submit_btn.click(analyze_document, [api_key, user_prompt, uploaded_file], output)
+if __name__ == "__main__":
+    demo.launch()