# app.py — Nougat OCR Gradio Space
# NOTE: An earlier single-page (image-or-PDF) implementation previously lived
# here as commented-out code; it was removed in favor of the multi-page PDF
# pipeline below. Recover it from version control if needed.
import gradio as gr
from transformers import pipeline
import torch
from PIL import Image
import io
import fitz # PyMuPDF
import os
import tempfile # Used for creating temporary files for download
# --- Model Loading ---
# Module-level side effect: loads the Nougat model once at import time so
# every Gradio request reuses the same pipeline instance.
try:
    # Set up the device based on availability
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    # Load the Nougat pipeline
    # NOTE(review): Nougat is a VisionEncoderDecoder model; it is wrapped here
    # via the generic "image-to-text" pipeline task — confirm this task name
    # works for the installed transformers version.
    nougat_pipeline = pipeline(
        "image-to-text",
        model="facebook/nougat-small",
        device=device,
        # Caps the length of the generated Markdown per page.
        max_new_tokens=1024,
        trust_remote_code=True
    )
    print(f"Nougat model loaded successfully on {device}")
except Exception as e:
    # On any load failure (missing weights, OOM, bad hardware tier), fall back
    # to a None sentinel; nougat_ocr() checks this and reports the error
    # instead of crashing the Space.
    print(f"Error loading Nougat model: {e}")
    nougat_pipeline = None
# --- OCR Function (Revised for Multi-Page PDF and TXT Output) ---
def nougat_ocr(document):
    """Run Nougat OCR over every page of an uploaded PDF.

    Args:
        document: A Gradio file object (exposing a ``.name`` path attribute)
            or a plain filesystem path string pointing at a PDF.

    Returns:
        tuple: ``(txt_file_path, aggregated_markdown)`` on success, or
        ``(None, error_message)`` on any failure. The TXT file is a
        temporary file left on disk (delete=False) so Gradio can serve it.
    """
    if nougat_pipeline is None:
        return None, "Error: Nougat model failed to load. Check dependencies."
    # Newer Gradio versions may pass a bare path string instead of a
    # tempfile wrapper; accept both (backward-compatible generalization).
    file_path = document if isinstance(document, str) else document.name
    # Only PDF processing is supported for the multi-page output logic.
    if not file_path.lower().endswith(".pdf"):
        return None, "Error: Please upload a PDF file for multi-page processing."
    all_markdown_text = []
    try:
        # Context manager guarantees the PDF handle is closed even when a
        # page render or the OCR pipeline raises mid-loop (the previous
        # version leaked the handle on such errors).
        with fitz.open(file_path) as doc:
            if len(doc) == 0:
                return None, "Error: PDF contains no pages."
            # Process pages one by one
            for i in range(len(doc)):
                page = doc.load_page(i)
                # Render at high DPI for better OCR accuracy.
                pix = page.get_pixmap(dpi=300)
                # Convert pixmap -> PNG bytes -> PIL Image for the pipeline.
                img_data = pix.tobytes("png")
                image = Image.open(io.BytesIO(img_data)).convert("RGB")
                # Perform OCR inference for this page.
                output = nougat_pipeline(image)
                markdown_text = (
                    output[0]['generated_text']
                    if output
                    else f"[OCR FAILED FOR PAGE {i + 1}]"
                )
                # Clear per-page separator so consumers can split pages back out.
                all_markdown_text.append(
                    f"\n\n\n# --- PAGE {i + 1} ---\n\n{markdown_text}"
                )
        aggregated_text = "".join(all_markdown_text)
        # delete=False: the file must outlive this scope for Gradio to serve it.
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".txt", encoding="utf-8"
        ) as tmp_file:
            tmp_file.write(aggregated_text)
            temp_file_path = tmp_file.name
        # Return both the downloadable file path and the preview text.
        return temp_file_path, aggregated_text
    except Exception as e:
        return None, f"An error occurred during processing: {e}"
# --- Gradio Interface ---
title = "🍫 Multi-Page Nougat OCR to TXT"
description = "Upload a PDF. The model will process each page sequentially and output a TXT file for download, along with a Markdown preview."
# Wire nougat_ocr's (file_path, text) return tuple to the two outputs below,
# in order: the File component receives the temp .txt path, the Markdown
# component receives the aggregated text for preview.
iface = gr.Interface(
    fn=nougat_ocr,
    inputs=gr.File(
        label="Upload PDF Document",
        file_types=[".pdf"],  # Restrict to PDF
        file_count="single"
    ),
    outputs=[
        gr.File(label="Download OCR Output (.txt)", file_count="single", file_types=[".txt"]),  # Downloadable file
        gr.Markdown(label="Preview (Formatted Markdown)", visible=True)  # Preview output
    ],
    title=title,
    description=description,
    # NOTE(review): allow_flagging was deprecated in favor of flagging_mode in
    # Gradio 5.x — confirm against the pinned Gradio version for this Space.
    allow_flagging="auto",
    theme=gr.themes.Soft()
)
# Standard script entry guard: launch the app only when run directly,
# not when imported (e.g. by the Spaces runtime or tests).
if __name__ == "__main__":
    iface.launch()