Spaces:

heerjtdev
/

yolo_layoutlm

Sleeping

App Files Community

heerjtdev committed 29 days ago

Commit

3541663

verified ·

1 Parent(s): f0a088d

Update working_yolo_pipeline.py

Browse files

Files changed (1) hide show

working_yolo_pipeline.py +113 -110

working_yolo_pipeline.py CHANGED Viewed

@@ -579,6 +579,9 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
     return converted_ocr_output
 # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
 #                             page_num: int, fitz_page: fitz.Page,
 #                             pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
@@ -587,7 +590,7 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
 #     1. Run YOLO to find Equations/Tables.
 #     2. Mask raw text with YOLO boxes.
 #     3. Run Column Detection on the MASKED data.
-#     4. Proceed with OCR and Output.
 #     """
 #     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
@@ -598,7 +601,7 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
 #         return None, None
 #     # ====================================================================
-#     # --- STEP 1: YOLO DETECTION (MOVED TO FIRST STEP) ---
 #     # ====================================================================
 #     start_time_yolo = time.time()
 #     results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
@@ -613,16 +616,14 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
 #                 relevant_detections.append(
 #                     {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
 #                 )
-#     non_nested_detections = filter_nested_boxes(relevant_detections, ioa_threshold=0.85)
-#     # 2. Then, run your standard overlap merge on what remains
-#     merged_detections = merge_overlapping_boxes(non_nested_detections, IOU_MERGE_THRESHOLD)
-#     # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
 #     print(f"    [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
 #     # ====================================================================
 #     # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
 #     # ====================================================================
 #     raw_words_for_layout = get_word_data_for_detection(
 #         fitz_page, pdf_path, page_num,
 #         top_margin_percent=0.10, bottom_margin_percent=0.10
@@ -631,7 +632,7 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
 #     masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
 #     # ====================================================================
-#     # --- STEP 3: COLUMN DETECTION (MASKED + COVERAGE + BRIDGING CHECK) ---
 #     # ====================================================================
 #     page_width_pdf = fitz_page.rect.width
 #     page_height_pdf = fitz_page.rect.height
@@ -698,45 +699,88 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
 #     # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
 #     # ====================================================================
 #     raw_ocr_output = []
-#     scale_factor = 2.0
 #     try:
 #         raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
 #     except Exception as e:
 #         print(f"  ❌ Native text extraction failed: {e}")
 #     if not raw_ocr_output:
 #         if _ocr_cache.has_ocr(pdf_path, page_num):
 #             print(f"  ⚡ Using cached Tesseract OCR for page {page_num}")
 #             cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
 #             for word_tuple in cached_word_data:
 #                 word_text, x1, y1, x2, y2 = word_tuple
 #                 x1_pix = int(x1 * scale_factor)
 #                 y1_pix = int(y1 * scale_factor)
 #                 x2_pix = int(x2 * scale_factor)
 #                 y2_pix = int(y2 * scale_factor)
 #                 raw_ocr_output.append({
 #                     'type': 'text', 'word': word_text, 'confidence': 95.0,
 #                     'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
 #                     'y0': y1_pix, 'x0': x1_pix
 #                 })
 #         else:
 #             try:
-#                 pil_img = Image.fromarray(cv2.cvtColor(original_img, cv2.COLOR_BGR2RGB))
-#                 hocr_data = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT)
 #                 for i in range(len(hocr_data['level'])):
 #                     text = hocr_data['text'][i].strip()
 #                     if text and hocr_data['conf'][i] > -1:
-#                         x1 = int(hocr_data['left'][i])
-#                         y1 = int(hocr_data['top'][i])
-#                         x2 = x1 + int(hocr_data['width'][i])
-#                         y2 = y1 + int(hocr_data['height'][i])
 #                         raw_ocr_output.append({
-#                             'type': 'text', 'word': text, 'confidence': float(hocr_data['conf'][i]),
-#                             'bbox': [x1, y1, x2, y2], 'y0': y1, 'x0': x1
 #                         })
 #             except Exception as e:
 #                 print(f"  ❌ Tesseract OCR Error: {e}")
 #     # ====================================================================
 #     # --- STEP 6: OCR CLEANING AND MERGING ---
@@ -746,6 +790,7 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
 #     for ocr_word in raw_ocr_output:
 #         is_suppressed = False
 #         for component in component_metadata:
 #             ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
 #             if ioa > IOA_SUPPRESSION_THRESHOLD:
 #                 is_suppressed = True
@@ -753,6 +798,7 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
 #         if not is_suppressed:
 #             items_to_sort.append(ocr_word)
 #     items_to_sort.extend(component_metadata)
 #     # ====================================================================
@@ -799,11 +845,11 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                             page_num: int, fitz_page: fitz.Page,
                             pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
     """
-    OPTIMIZED FLOW:
     1. Run YOLO to find Equations/Tables.
     2. Mask raw text with YOLO boxes.
-    3. Run Column Detection on the MASKED data.
-    4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
     """
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
@@ -818,7 +864,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     # ====================================================================
     start_time_yolo = time.time()
     results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
     relevant_detections = []
     if results and results[0].boxes:
         for box in results[0].boxes:
@@ -834,9 +880,10 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     print(f"    [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
     # ====================================================================
-    # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
     # ====================================================================
-    # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
     raw_words_for_layout = get_word_data_for_detection(
         fitz_page, pdf_path, page_num,
         top_margin_percent=0.10, bottom_margin_percent=0.10
@@ -848,21 +895,21 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     # --- STEP 3: COLUMN DETECTION ---
     # ====================================================================
     page_width_pdf = fitz_page.rect.width
-    page_height_pdf = fitz_page.rect.height
     column_detection_params = {
         'cluster_bin_size': 2, 'cluster_smoothing': 2,
         'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
     }
     separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
     page_separator_x = None
     if separators:
         central_min = page_width_pdf * 0.35
         central_max = page_width_pdf * 0.65
         central_separators = [s for s in separators if central_min <= s <= central_max]
         if central_separators:
             center_x = page_width_pdf / 2
             page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
@@ -909,97 +956,41 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
         })
     # ====================================================================
-    # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
     # ====================================================================
     raw_ocr_output = []
-    scale_factor = 2.0 # Pipeline standard scale
-    try:
-        # Try getting native text first
-        raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
-    except Exception as e:
-        print(f"  ❌ Native text extraction failed: {e}")
-    # If native text is missing, fall back to OCR
-    if not raw_ocr_output:
-        if _ocr_cache.has_ocr(pdf_path, page_num):
-            print(f"  ⚡ Using cached Tesseract OCR for page {page_num}")
-            cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
-            for word_tuple in cached_word_data:
-                word_text, x1, y1, x2, y2 = word_tuple
-                # Scale from PDF points to Pipeline Pixels (2.0)
-                x1_pix = int(x1 * scale_factor)
-                y1_pix = int(y1 * scale_factor)
-                x2_pix = int(x2 * scale_factor)
-                y2_pix = int(y2 * scale_factor)
-                raw_ocr_output.append({
-                    'type': 'text', 'word': word_text, 'confidence': 95.0,
-                    'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
-                    'y0': y1_pix, 'x0': x1_pix
-                })
-        else:
-            # === START OF OPTIMIZED OCR BLOCK ===
-            try:
-                # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
-                # We do this specifically for OCR accuracy, separate from the pipeline image
-                ocr_zoom = 4.0
-                pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
-                # Convert PyMuPDF Pixmap to OpenCV format
-                img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width, pix_ocr.n)
-                if pix_ocr.n == 3: img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
-                elif pix_ocr.n == 4: img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
-                # 2. Preprocess (Binarization)
-                # Ensure 'preprocess_image_for_ocr' is defined at top of file!
-                processed_img = preprocess_image_for_ocr(img_ocr_np)
-                # 3. Run Tesseract with Optimized Configuration
-                # --oem 3: Default LSTM engine
-                # --psm 6: Assume a single uniform block of text (Critical for lists/questions)
-                custom_config = r'--oem 3 --psm 6'
-                hocr_data = pytesseract.image_to_data(
-                    processed_img,
-                    output_type=pytesseract.Output.DICT,
-                    config=custom_config
-                )
-                for i in range(len(hocr_data['level'])):
-                    text = hocr_data['text'][i].strip()
-                    if text and hocr_data['conf'][i] > -1:
-                        # 4. Coordinate Mapping
-                        # We scanned at Zoom 4.0, but our pipeline expects Zoom 2.0.
-                        # Scale Factor = (Target 2.0) / (Source 4.0) = 0.5
-                        scale_adjustment = scale_factor / ocr_zoom
-                        x1 = int(hocr_data['left'][i] * scale_adjustment)
-                        y1 = int(hocr_data['top'][i] * scale_adjustment)
-                        w = int(hocr_data['width'][i] * scale_adjustment)
-                        h = int(hocr_data['height'][i] * scale_adjustment)
-                        x2 = x1 + w
-                        y2 = y1 + h
-                        raw_ocr_output.append({
-                            'type': 'text',
-                            'word': text,
-                            'confidence': float(hocr_data['conf'][i]),
-                            'bbox': [x1, y1, x2, y2],
-                            'y0': y1,
-                            'x0': x1
-                        })
-            except Exception as e:
-                print(f"  ❌ Tesseract OCR Error: {e}")
-            # === END OF OPTIMIZED OCR BLOCK ===
     # ====================================================================
-    # --- STEP 6: OCR CLEANING AND MERGING ---
     # ====================================================================
     items_to_sort = []
     for ocr_word in raw_ocr_output:
         is_suppressed = False
         for component in component_metadata:
@@ -1015,7 +1006,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     items_to_sort.extend(component_metadata)
     # ====================================================================
-    # --- STEP 7: LINE-BASED SORTING ---
     # ====================================================================
     items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
     lines = []
@@ -1051,6 +1042,18 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     return final_output, page_separator_x
 def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT

     return converted_ocr_output
 # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
 #                             page_num: int, fitz_page: fitz.Page,
 #                             pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
 #     1. Run YOLO to find Equations/Tables.
 #     2. Mask raw text with YOLO boxes.
 #     3. Run Column Detection on the MASKED data.
+#     4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
 #     """
 #     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
 #         return None, None
 #     # ====================================================================
+#     # --- STEP 1: YOLO DETECTION ---
 #     # ====================================================================
 #     start_time_yolo = time.time()
 #     results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
 #                 relevant_detections.append(
 #                     {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
 #                 )
+#     merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
 #     print(f"    [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
 #     # ====================================================================
 #     # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
 #     # ====================================================================
+#     # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
 #     raw_words_for_layout = get_word_data_for_detection(
 #         fitz_page, pdf_path, page_num,
 #         top_margin_percent=0.10, bottom_margin_percent=0.10
 #     masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
 #     # ====================================================================
+#     # --- STEP 3: COLUMN DETECTION ---
 #     # ====================================================================
 #     page_width_pdf = fitz_page.rect.width
 #     page_height_pdf = fitz_page.rect.height
 #     # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
 #     # ====================================================================
 #     raw_ocr_output = []
+#     scale_factor = 2.0 # Pipeline standard scale
 #     try:
+#         # Try getting native text first
 #         raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
 #     except Exception as e:
 #         print(f"  ❌ Native text extraction failed: {e}")
+#     # If native text is missing, fall back to OCR
 #     if not raw_ocr_output:
 #         if _ocr_cache.has_ocr(pdf_path, page_num):
 #             print(f"  ⚡ Using cached Tesseract OCR for page {page_num}")
 #             cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
 #             for word_tuple in cached_word_data:
 #                 word_text, x1, y1, x2, y2 = word_tuple
+#                 # Scale from PDF points to Pipeline Pixels (2.0)
 #                 x1_pix = int(x1 * scale_factor)
 #                 y1_pix = int(y1 * scale_factor)
 #                 x2_pix = int(x2 * scale_factor)
 #                 y2_pix = int(y2 * scale_factor)
 #                 raw_ocr_output.append({
 #                     'type': 'text', 'word': word_text, 'confidence': 95.0,
 #                     'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
 #                     'y0': y1_pix, 'x0': x1_pix
 #                 })
 #         else:
+#             # === START OF OPTIMIZED OCR BLOCK ===
 #             try:
+#                 # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
+#                 # We do this specifically for OCR accuracy, separate from the pipeline image
+#                 ocr_zoom = 4.0
+#                 pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
+#                 # Convert PyMuPDF Pixmap to OpenCV format
+#                 img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width, pix_ocr.n)
+#                 if pix_ocr.n == 3: img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
+#                 elif pix_ocr.n == 4: img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
+#                 # 2. Preprocess (Binarization)
+#                 # Ensure 'preprocess_image_for_ocr' is defined at top of file!
+#                 processed_img = preprocess_image_for_ocr(img_ocr_np)
+#                 # 3. Run Tesseract with Optimized Configuration
+#                 # --oem 3: Default LSTM engine
+#                 # --psm 6: Assume a single uniform block of text (Critical for lists/questions)
+#                 custom_config = r'--oem 3 --psm 6'
+#                 hocr_data = pytesseract.image_to_data(
+#                     processed_img,
+#                     output_type=pytesseract.Output.DICT,
+#                     config=custom_config
+#                 )
 #                 for i in range(len(hocr_data['level'])):
 #                     text = hocr_data['text'][i].strip()
 #                     if text and hocr_data['conf'][i] > -1:
+#                         # 4. Coordinate Mapping
+#                         # We scanned at Zoom 4.0, but our pipeline expects Zoom 2.0.
+#                         # Scale Factor = (Target 2.0) / (Source 4.0) = 0.5
+#                         scale_adjustment = scale_factor / ocr_zoom
+#                         x1 = int(hocr_data['left'][i] * scale_adjustment)
+#                         y1 = int(hocr_data['top'][i] * scale_adjustment)
+#                         w = int(hocr_data['width'][i] * scale_adjustment)
+#                         h = int(hocr_data['height'][i] * scale_adjustment)
+#                         x2 = x1 + w
+#                         y2 = y1 + h
 #                         raw_ocr_output.append({
+#                             'type': 'text',
+#                             'word': text,
+#                             'confidence': float(hocr_data['conf'][i]),
+#                             'bbox': [x1, y1, x2, y2],
+#                             'y0': y1,
+#                             'x0': x1
 #                         })
 #             except Exception as e:
 #                 print(f"  ❌ Tesseract OCR Error: {e}")
+#             # === END OF OPTIMIZED OCR BLOCK ===
 #     # ====================================================================
 #     # --- STEP 6: OCR CLEANING AND MERGING ---
 #     for ocr_word in raw_ocr_output:
 #         is_suppressed = False
 #         for component in component_metadata:
+#             # Do not include words that are inside figure/equation boxes
 #             ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
 #             if ioa > IOA_SUPPRESSION_THRESHOLD:
 #                 is_suppressed = True
 #         if not is_suppressed:
 #             items_to_sort.append(ocr_word)
+#     # Add figures/equations back into the flow as "words"
 #     items_to_sort.extend(component_metadata)
 #     # ====================================================================
                             page_num: int, fitz_page: fitz.Page,
                             pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
     """
+    OPTIMIZED FLOW:
     1. Run YOLO to find Equations/Tables.
     2. Mask raw text with YOLO boxes.
+    3. Run Column Detection on the MASKED data (Populates OCR cache).
+    4. Proceed with Final OCR Output (Strictly using the cache).
     """
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
     # ====================================================================
     start_time_yolo = time.time()
     results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
     relevant_detections = []
     if results and results[0].boxes:
         for box in results[0].boxes:
     print(f"    [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
     # ====================================================================
+    # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING & CACHING) ---
+    # This call to get_word_data_for_detection will execute Tesseract if
+    # native words are missing, and save the result to the cache.
     # ====================================================================
     raw_words_for_layout = get_word_data_for_detection(
         fitz_page, pdf_path, page_num,
         top_margin_percent=0.10, bottom_margin_percent=0.10
     # --- STEP 3: COLUMN DETECTION ---
     # ====================================================================
     page_width_pdf = fitz_page.rect.width
+    page_height_pdf = fitz_page.rect.height
     column_detection_params = {
         'cluster_bin_size': 2, 'cluster_smoothing': 2,
         'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
     }
     separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
     page_separator_x = None
     if separators:
         central_min = page_width_pdf * 0.35
         central_max = page_width_pdf * 0.65
         central_separators = [s for s in separators if central_min <= s <= central_max]
         if central_separators:
             center_x = page_width_pdf / 2
             page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
         })
     # ====================================================================
+    # --- STEP 5: CACHED OCR RETRIEVAL (No Redundant Tesseract) ---
     # ====================================================================
     raw_ocr_output = []
+    scale_factor = 2.0  # Pipeline standard scale
+    if _ocr_cache.has_ocr(pdf_path, page_num):
+        print(f"  ⚡ Using cached OCR (Native or Tesseract) for page {page_num}")
+        cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
+        for word_tuple in cached_word_data:
+            # Cache stores: (text, x1, y1, x2, y2) in PDF points (see get_word_data_for_detection)
+            word_text, x1, y1, x2, y2 = word_tuple
+            # Scale from PDF points back to Pipeline Pixels (2.0)
+            x1_pix = int(x1 * scale_factor)
+            y1_pix = int(y1 * scale_factor)
+            x2_pix = int(x2 * scale_factor)
+            y2_pix = int(y2 * scale_factor)
+            raw_ocr_output.append({
+                'type': 'text', 'word': word_text, 'confidence': 95.0, # 95.0 is a default/placeholder confidence
+                'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
+                'y0': y1_pix, 'x0': x1_pix
+            })
+    else:
+        # This branch is hit only if the cache check in Step 2 failed to produce text,
+        # meaning the page is genuinely textless or entirely composed of images/figures.
+        print(f"  ⚠️ No text found in cache for page {page_num}. Proceeding without words.")
     # ====================================================================
+    # --- STEP 6: OCR CLEANING AND MERGING (Original Logic Unchanged) ---
     # ====================================================================
     items_to_sort = []
     for ocr_word in raw_ocr_output:
         is_suppressed = False
         for component in component_metadata:
     items_to_sort.extend(component_metadata)
     # ====================================================================
+    # --- STEP 7: LINE-BASED SORTING (Original Logic Unchanged) ---
     # ====================================================================
     items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
     lines = []
     return final_output, page_separator_x
 def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT