Spaces:

heerjtdev
/

nougat

Sleeping

App Files Files Community

nougat / app.py

heerjtdev

Update app.py

5759d4b verified about 1 month ago

raw

history blame contribute delete

7.24 kB

	# # app.py
	# import gradio as gr
	# from transformers import pipeline
	# import torch
	# from PIL import Image
	# import io
	# import fitz # PyMuPDF

	# # --- Model Loading ---
	# # Nougat is typically used for PDF/document image OCR.
	# # The `facebook/nougat-small` model is a good starting point.
	# # Using 'facebook/nougat-base' or 'facebook/nougat-large' is more accurate but requires more GPU memory/power.
	# try:
	# # Set up the device based on availability
	# device = "cuda:0" if torch.cuda.is_available() else "cpu"

	# # Load the Nougat pipeline
	# # The task is technically 'document-image-to-text' but can be inferred by the model name
	# nougat_pipeline = pipeline(
	# "image-to-text",
	# model="facebook/nougat-small",
	# device=device,
	# # Set max_new_tokens for the output length
	# max_new_tokens=1024,
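	# # NOTE(review): this comment originally said "Set to False", but the flag below is passed as True.
	# # Stated intent: prevent a warning about the model not having an image-to-text pipeline
	# # (the pipeline can still wrap the VisionEncoderDecoder model).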
	# trust_remote_code=True
	# )
	# print(f"Nougat model loaded successfully on {device}")

	# except Exception as e:
	# # Fallback/error handling for model loading
	# print(f"Error loading Nougat model: {e}")
	# nougat_pipeline = None


	# # --- OCR Function ---
	# def nougat_ocr(document):
	# """Performs Nougat OCR on a single-page document image or PDF."""
	# if nougat_pipeline is None:
	# return "Error: Nougat model failed to load. Check your Space hardware and dependencies."

	# # Handle File object from Gradio (could be an image or a PDF)
	# file_path = document.name

	# # 1. Convert PDF (or first page of PDF) to an image
	# if file_path.lower().endswith(('.pdf')):
	# try:
	# # Open PDF using PyMuPDF (fitz)
	# doc = fitz.open(file_path)
	# if len(doc) == 0:
	# return "Error: PDF contains no pages."

	# # Render the first page at a high DPI for better OCR
	# page = doc.load_page(0)
	# pix = page.get_pixmap(dpi=300)

	# # Convert pixmap to PIL Image
	# img_data = pix.tobytes("png")
	# image = Image.open(io.BytesIO(img_data))
	# doc.close()

	# except Exception as e:
	# return f"Error processing PDF: {e}"

	# # 2. Handle image file (png, jpg, etc.)
	# elif file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.webp')):
	# image = Image.open(file_path).convert("RGB")
	# else:
	# return "Error: Unsupported file format. Please upload an image or a PDF."

	# # 3. Perform OCR inference
	# try:
	# # Pass the PIL image to the pipeline
	# output = nougat_pipeline(image)
	# # The output is typically a list of dicts: [{'generated_text': '...'}]
	# markdown_text = output[0]['generated_text'] if output else "OCR failed to generate text."

	# return markdown_text

	# except Exception as e:
	# return f"An error occurred during OCR: {e}"


	# # --- Gradio Interface ---
	# title = "🍫 Nougat OCR for Documents"
	# description = "Upload a single-page document image (PNG/JPG) or a PDF to transcribe it into Markdown format using the Nougat-small model. Note: For multi-page PDFs, only the first page is processed."

	# iface = gr.Interface(
	# fn=nougat_ocr,
	# inputs=gr.File(
	# label="Upload Document (Image or PDF)",
	# file_types=["image", ".pdf"],
	# file_count="single"
	# ),
	# outputs=gr.Markdown(label="Generated Markdown Output"),
	# title=title,
	# description=description,
	# allow_flagging="auto",
	# theme=gr.themes.Soft()
	# )

	# if __name__ == "__main__":
	# iface.launch()



	import gradio as gr
	from transformers import pipeline
	import torch
	from PIL import Image
	import io
	import fitz # PyMuPDF
	import os
	import tempfile # Used for creating temporary files for download

	# --- Model Loading ---
	try:
	    # Prefer the first CUDA device when one is visible; otherwise run on CPU.
	    if torch.cuda.is_available():
	        device = "cuda:0"
	    else:
	        device = "cpu"

	    # Build the image-to-text pipeline around the Nougat-small checkpoint.
	    nougat_pipeline = pipeline(
	        task="image-to-text",
	        model="facebook/nougat-small",
	        device=device,
	        max_new_tokens=1024,
	        trust_remote_code=True,
	    )
	    print(f"Nougat model loaded successfully on {device}")
	except Exception as load_error:
	    # Keep the module importable: a None pipeline marks the load as failed.
	    print(f"Error loading Nougat model: {load_error}")
	    nougat_pipeline = None


	# --- OCR Function (Revised for Multi-Page PDF and TXT Output) ---
	def nougat_ocr(document):
	    """Run Nougat OCR on every page of an uploaded PDF.

	    Each page is rendered to an image at 300 DPI, passed through the
	    module-level ``nougat_pipeline``, and the per-page results are joined
	    under ``# --- PAGE n ---`` headings. The combined text is also written
	    to a temporary ``.txt`` file.

	    Args:
	        document: The uploaded file, either an object exposing its path as
	            ``.name`` or the path itself (``str`` / ``os.PathLike``).
	            ``None`` is reported as a missing upload.

	    Returns:
	        A ``(txt_path, text)`` tuple. On success ``txt_path`` is the path
	        of the temporary file and ``text`` is the aggregated output. On
	        failure ``txt_path`` is ``None`` and ``text`` is an error message.
	    """
	    if nougat_pipeline is None:
	        return None, "Error: Nougat model failed to load. Check dependencies."

	    # Nothing to process when the handler is invoked without a file.
	    if document is None:
	        return None, "Error: No file was uploaded."

	    # Accept a plain path as well as a file wrapper carrying the path in `.name`.
	    if isinstance(document, (str, os.PathLike)):
	        file_path = os.fspath(document)
	    else:
	        file_path = document.name

	    # Only PDF processing is supported for the multi-page output logic.
	    if not file_path.lower().endswith(".pdf"):
	        return None, "Error: Please upload a PDF file for multi-page processing."

	    try:
	        page_sections = []

	        # The context manager closes the document on every exit path,
	        # including the early return and any exception raised mid-loop.
	        with fitz.open(file_path) as doc:
	            if len(doc) == 0:
	                return None, "Error: PDF contains no pages."

	            for page_number, page in enumerate(doc, start=1):
	                # Render the page to a high-DPI image.
	                pix = page.get_pixmap(dpi=300)

	                # Convert the pixmap to a PIL image via an in-memory PNG.
	                png_bytes = pix.tobytes("png")
	                image = Image.open(io.BytesIO(png_bytes)).convert("RGB")

	                # Perform OCR inference for this page.
	                output = nougat_pipeline(image)
	                if output:
	                    markdown_text = output[0]["generated_text"]
	                else:
	                    markdown_text = f"[OCR FAILED FOR PAGE {page_number}]"

	                # Add a clear separator and the page content.
	                page_sections.append(
	                    f"\n\n\n# --- PAGE {page_number} ---\n\n{markdown_text}"
	                )

	        aggregated_text = "".join(page_sections)

	        # delete=False: the file has to outlive this call because its path
	        # is part of the return value.
	        with tempfile.NamedTemporaryFile(
	            mode="w", delete=False, suffix=".txt", encoding="utf-8"
	        ) as tmp_file:
	            tmp_file.write(aggregated_text)
	            temp_file_path = tmp_file.name

	        # Return both the file path and the text content.
	        return temp_file_path, aggregated_text

	    except Exception as e:
	        # Report the failure as text instead of raising, matching the
	        # (path, message) shape of every other return.
	        return None, f"An error occurred during processing: {e}"


	# --- Gradio Interface ---
	title = "🍫 Multi-Page Nougat OCR to TXT"
	description = "Upload a PDF. The model will process each page sequentially and output a TXT file for download, along with a Markdown preview."


	def _build_interface():
	    """Assemble the Gradio UI: one PDF upload in, a TXT download and a Markdown preview out."""
	    # Input: a single file, restricted to PDF.
	    pdf_upload = gr.File(
	        label="Upload PDF Document",
	        file_types=[".pdf"],
	        file_count="single",
	    )
	    # Outputs, in the order nougat_ocr returns them: file path, then text.
	    txt_download = gr.File(
	        label="Download OCR Output (.txt)",
	        file_count="single",
	        file_types=[".txt"],
	    )
	    markdown_preview = gr.Markdown(
	        label="Preview (Formatted Markdown)",
	        visible=True,
	    )
	    return gr.Interface(
	        fn=nougat_ocr,
	        inputs=pdf_upload,
	        outputs=[txt_download, markdown_preview],
	        title=title,
	        description=description,
	        allow_flagging="auto",
	        theme=gr.themes.Soft(),
	    )


	iface = _build_interface()

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()