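# app.py: Nougat OCR Gradio Space (multi-page PDF to Markdown/TXT)
#
# Suggested requirements.txt contents for the Space, inferred from the imports
# below; package names are the standard PyPI names, and version pins are left
# to the reader (this list is a hint, not part of the original app):
#
#   gradio
#   transformers
#   torch
#   pillow
#   pymupdf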
import gradio as gr
from transformers import pipeline
import torch
from PIL import Image
import io
import fitz  # PyMuPDF
import tempfile  # Used to create a temporary file for the download output

# --- Model Loading ---
# Nougat is built for document/academic-paper OCR. `facebook/nougat-small` is a
# good starting point; `facebook/nougat-base` is more accurate but needs more
# GPU memory.
try:
    # Pick a GPU if one is available, otherwise fall back to CPU
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Load Nougat through the image-to-text pipeline
    nougat_pipeline = pipeline(
        "image-to-text",
        model="facebook/nougat-small",
        device=device,
        max_new_tokens=1024,  # cap the length of the generated transcription
        trust_remote_code=True,
    )
    print(f"Nougat model loaded successfully on {device}")
except Exception as e:
    print(f"Error loading Nougat model: {e}")
    nougat_pipeline = None


# --- OCR Function (multi-page PDF, aggregated Markdown + TXT download) ---
def nougat_ocr(document):
    """
    Run Nougat OCR on every page of an uploaded PDF, aggregate the results,
    and write them to a temporary .txt file for download.

    Returns a (txt_file_path, markdown_text) tuple matching the two Gradio outputs.
    """
    if nougat_pipeline is None:
        return None, "Error: Nougat model failed to load. Check your Space hardware and dependencies."

    # Gradio may pass either a plain file path (str) or a file-like object with a
    # .name attribute, depending on the Gradio version, so handle both.
    file_path = document if isinstance(document, str) else document.name
    all_markdown_text = []

    # Only PDFs are supported by the multi-page output logic
    if not file_path.lower().endswith(".pdf"):
        return None, "Error: Please upload a PDF file for multi-page processing."

    try:
        doc = fitz.open(file_path)
        if len(doc) == 0:
            return None, "Error: PDF contains no pages."

        # Process the pages one by one
        for i in range(len(doc)):
            page = doc.load_page(i)

            # Render the page at a high DPI for better OCR quality
            pix = page.get_pixmap(dpi=300)

            # Convert the pixmap to a PIL image
            img_data = pix.tobytes("png")
            image = Image.open(io.BytesIO(img_data)).convert("RGB")

            # Run OCR inference on this page
            output = nougat_pipeline(image)
            markdown_text = output[0]["generated_text"] if output else f"[OCR FAILED FOR PAGE {i + 1}]"

            # Add a clear page separator followed by the page content
            all_markdown_text.append(f"\n\n\n# --- PAGE {i + 1} ---\n\n{markdown_text}")

        doc.close()
        aggregated_text = "".join(all_markdown_text)

        # Save the aggregated result to a temporary file for the download component
        with tempfile.NamedTemporaryFile(
            mode="w", delete=False, suffix=".txt", encoding="utf-8"
        ) as tmp_file:
            tmp_file.write(aggregated_text)
            temp_file_path = tmp_file.name

        # Return the file path (for gr.File) and the text (for the Markdown preview)
        return temp_file_path, aggregated_text

    except Exception as e:
        return None, f"An error occurred during processing: {e}"


# --- Gradio Interface ---
title = "🍫 Multi-Page Nougat OCR to TXT"
description = "Upload a PDF. The model will process each page sequentially and output a TXT file for download, along with a Markdown preview."

iface = gr.Interface(
    fn=nougat_ocr,
    inputs=gr.File(
        label="Upload PDF Document",
        file_types=[".pdf"],  # Restrict uploads to PDF
        file_count="single",
    ),
    outputs=[
        gr.File(label="Download OCR Output (.txt)", file_count="single", file_types=[".txt"]),  # Downloadable file
        gr.Markdown(label="Preview (Formatted Markdown)", visible=True),  # Inline preview
    ],
    title=title,
    description=description,
    allow_flagging="auto",
    theme=gr.themes.Soft(),
)

if __name__ == "__main__":
    iface.launch()
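# --- Optional local smoke test (sketch only) ---
# A minimal, hedged example of exercising nougat_ocr() without launching the
# Gradio UI. "sample.pdf" and _smoke_test are hypothetical and not part of the
# app; uncomment and adapt as needed.
#
# def _smoke_test(pdf_path="sample.pdf"):
#     txt_path, preview = nougat_ocr(pdf_path)  # nougat_ocr also accepts a plain path string
#     print("TXT saved to:", txt_path)
#     print((preview or "")[:500])
#
# _smoke_test()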