heerjtdev committed
Commit e9f6960 · Parent(s): 4ff69d8

Update working_yolo_pipeline.py

Files changed (1)
  1. working_yolo_pipeline.py +56 -36
working_yolo_pipeline.py CHANGED
@@ -1543,7 +1543,6 @@
 # sys.exit(1)
 
 
-
 import json
 import argparse
 import os
@@ -1700,7 +1699,6 @@ def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_
         return raw_word_data
 
     # 1. Convert YOLO boxes (Pixels) to PDF Coordinates (Points)
-    # Note: raw_word_data is in PDF points. YOLO is in Pixels (scaled by scale_factor).
     pdf_space_boxes = []
     for det in yolo_detections:
         x1, y1, x2, y2 = det['coords']
@@ -1715,7 +1713,6 @@ def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_
     # 2. Filter out raw words that are inside YOLO boxes
     cleaned_word_data = []
     for word_tuple in raw_word_data:
-        # word_tuple format: (text, x1, y1, x2, y2, block_no, line_no, word_no)
         wx1, wy1, wx2, wy2 = word_tuple[1], word_tuple[2], word_tuple[3], word_tuple[4]
         w_center_x = (wx1 + wx2) / 2
         w_center_y = (wy1 + wy2) / 2
@@ -1731,15 +1728,55 @@ def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_
 
     # 3. Add the YOLO boxes themselves as "Solid Words"
     for i, (px1, py1, px2, py2) in enumerate(pdf_space_boxes):
-        # Create a dummy tuple matching the structure: (text, x1, y1, x2, y2)
         dummy_entry = (f"BLOCK_{i}", px1, py1, px2, py2)
         cleaned_word_data.append(dummy_entry)
 
     return cleaned_word_data
 
 
+def calculate_vertical_gap_coverage(word_data: list, sep_x: int, page_height: float, gutter_width: int = 10) -> float:
+    """
+    Calculates what percentage of the page's vertical text span is 'cleanly split' by the separator.
+    A valid column split should split > 65% of the page verticality.
+    """
+    if not word_data:
+        return 0.0
+
+    # 1. Determine the vertical span of the actual text content
+    y_coords = [w[2] for w in word_data] + [w[4] for w in word_data]  # y1 and y2
+    min_y, max_y = min(y_coords), max(y_coords)
+    total_text_height = max_y - min_y
+
+    if total_text_height <= 0:
+        return 0.0
+
+    # 2. Create a boolean array representing the Y-axis
+    gap_open_mask = np.ones(int(total_text_height) + 1, dtype=bool)
+
+    # Define the x-zone of the separator
+    zone_left = sep_x - (gutter_width / 2)
+    zone_right = sep_x + (gutter_width / 2)
+
+    # 3. Mark areas where the gap is "Blocked" by text
+    offset_y = int(min_y)
+
+    for _, x1, y1, x2, y2 in word_data:
+        # Check if this word horizontally interferes with the separator
+        if x2 > zone_left and x1 < zone_right:
+            y_start_idx = max(0, int(y1) - offset_y)
+            y_end_idx = min(len(gap_open_mask), int(y2) - offset_y)
+            if y_end_idx > y_start_idx:
+                gap_open_mask[y_start_idx:y_end_idx] = False
+
+    # 4. Calculate coverage
+    open_pixels = np.sum(gap_open_mask)
+    coverage_ratio = open_pixels / len(gap_open_mask)
+
+    return coverage_ratio
+
+
 def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> List[int]:
-    """Calculates the X-axis histogram and detects significant gutters."""
+    """Calculates the X-axis histogram and detects significant gutters using Vertical Coverage."""
     if not word_data: return []
 
     x_points = []
@@ -1771,22 +1808,17 @@ def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> Li
     separator_x_coords = [int(bin_edges[p]) for p in peaks]
     final_separators = []
 
-    # Validation: Check if gap is "blocked" significantly
-    # Since we have YOLO blocks merged in, we can trust the histogram more.
-    # But we still do a quick density check.
-    window_bins = max(1, int(10 / bin_size))
-
-    for peak_idx, x_coord in zip(peaks, separator_x_coords):
-        start_bin = max(0, peak_idx - window_bins)
-        end_bin = min(len(hist), peak_idx + window_bins)
-        raw_density_in_gutter = np.sum(hist[start_bin:end_bin])
-
-        left_density = np.sum(hist[max(0, start_bin - 20):start_bin])
-        right_density = np.sum(hist[end_bin:min(len(hist), end_bin + 20)])
-        neighbor_avg = (left_density + right_density) / 2
-
-        if neighbor_avg > 0 and raw_density_in_gutter < neighbor_avg * 0.4:
+    for x_coord in separator_x_coords:
+        # KEY CHECK: Vertical Coverage
+        # A column gap must exist for >65% of the page height.
+        # Single-column text (width-spanning lines) will result in ~0% coverage.
+        coverage = calculate_vertical_gap_coverage(word_data, x_coord, page_height, gutter_width=min_width)
+
+        if coverage >= 0.65:
             final_separators.append(x_coord)
+            print(f" -> Separator candidate X={x_coord} ACCEPTED (Coverage: {coverage:.1%})")
+        else:
+            print(f" -> Separator candidate X={x_coord} REJECTED (Coverage: {coverage:.1%})")
 
     return sorted(final_separators)
 
@@ -1862,7 +1894,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     OPTIMIZED FLOW:
     1. Run YOLO to find Equations/Tables.
     2. Mask raw text with YOLO boxes.
-    3. Run Column Detection on the MASKED data (prevents false positives in matrices).
+    3. Run Column Detection on the MASKED data.
    4. Proceed with OCR and Output.
     """
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
@@ -1877,7 +1909,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     # --- STEP 1: YOLO DETECTION (MOVED TO FIRST STEP) ---
     # ====================================================================
     start_time_yolo = time.time()
-    # Run inference
     results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
 
     relevant_detections = []
@@ -1891,25 +1922,21 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                 {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
             )
 
-    # Merge overlapping boxes immediately
     merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
-    print(f" [LOG] YOLO found {len(merged_detections)} objects (Equation/Figure) in {time.time() - start_time_yolo:.3f}s.")
+    print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
 
     # ====================================================================
     # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
     # ====================================================================
-    # Get raw PDF words first
     raw_words_for_layout = get_word_data_for_detection(
         fitz_page, pdf_path, page_num,
         top_margin_percent=0.10, bottom_margin_percent=0.10
     )
 
-    # KEY CHANGE: Inject YOLO boxes into the word list before detection
-    # Assuming 2.0 scale factor because fitz.Matrix(2.0, 2.0) was used in main loop
     masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
 
     # ====================================================================
-    # --- STEP 3: COLUMN DETECTION (Now running on MASKED data) ---
+    # --- STEP 3: COLUMN DETECTION (MASKED + COVERAGE CHECK) ---
     # ====================================================================
     page_width_pdf = fitz_page.rect.width
     page_height_pdf = fitz_page.rect.height
@@ -1919,10 +1946,8 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
         'cluster_min_width': 20, 'cluster_threshold_percentile': 85,
     }
 
-    # Use calculate_x_gutters directly on the masked data
     separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
 
-    # Finalize separator logic
     page_separator_x = None
     if separators:
         central_min = page_width_pdf * 0.35
@@ -1985,7 +2010,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     except Exception as e:
         print(f" ❌ Native text extraction failed: {e}")
 
-    # If native failed, use Tesseract
     if not raw_ocr_output:
         if _ocr_cache.has_ocr(pdf_path, page_num):
             print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
@@ -2002,7 +2026,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                     'y0': y1_pix, 'x0': x1_pix
                 })
         else:
-            # Run Tesseract
             try:
                 pil_img = Image.fromarray(cv2.cvtColor(original_img, cv2.COLOR_BGR2RGB))
                 hocr_data = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT)
@@ -2052,7 +2075,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                 placed = True
                 break
         if not placed and item['type'] in ['equation', 'figure']:
-            # Relaxed tolerance for big objects
             for line in lines:
                 y_ref = min(it['y0'] for it in line)
                 if abs(y_ref - item['y0']) < 20:
@@ -2739,6 +2761,4 @@ if __name__ == "__main__":
         print(f"\n✅ Final Data Saved: {final_output_path}")
     else:
         print("\n❌ Pipeline Failed.")
-        sys.exit(1)
-
-
+        sys.exit(1)
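
For readers skimming the diff: the masking step retained above converts each YOLO box from rendered-image pixels back to PDF points (the page is rendered at 2x, per the removed comment about fitz.Matrix(2.0, 2.0)), drops every raw word whose centre falls inside a box, and re-inserts the box itself as one solid "word". The snippet below is a minimal, self-contained sketch of that idea only; it assumes simple 5-element (text, x1, y1, x2, y2) word tuples, and the helper name and sample data are illustrative, not part of the pipeline.

# Illustrative sketch only (hypothetical helper, not the pipeline's own code).
# Assumes 5-element (text, x1, y1, x2, y2) word tuples and a 2.0 render scale.
def mask_words_with_boxes(words, yolo_boxes_px, scale_factor=2.0):
    # 1. Pixels -> points (the page image was rendered at scale_factor x resolution).
    boxes_pts = [tuple(c / scale_factor for c in box) for box in yolo_boxes_px]

    kept = []
    for text, x1, y1, x2, y2 in words:
        cx, cy = (x1 + x2) / 2, (y1 + y2) / 2
        # 2. Drop any word whose centre lies inside a detected block.
        inside = any(bx1 <= cx <= bx2 and by1 <= cy <= by2
                     for bx1, by1, bx2, by2 in boxes_pts)
        if not inside:
            kept.append((text, x1, y1, x2, y2))

    # 3. Re-insert each block as a single solid "word" so the column
    #    histogram still sees ink where the equation/figure sits.
    for i, (bx1, by1, bx2, by2) in enumerate(boxes_pts):
        kept.append((f"BLOCK_{i}", bx1, by1, bx2, by2))
    return kept


if __name__ == "__main__":
    words = [("alpha", 50, 100, 90, 112), ("beta", 300, 100, 340, 112)]
    boxes_px = [(560, 180, 720, 260)]  # detected on the 2x-rendered image
    print(mask_words_with_boxes(words, boxes_px))  # "beta" is masked; BLOCK_0 appended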
 
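The substantive change in this commit is the vertical-coverage validation of column-separator candidates: a candidate X position is kept only if at least 65% of the page's vertical text span is free of ink inside the candidate gutter, replacing the earlier local density check. Below is a condensed, self-contained sketch of the same idea on synthetic word boxes; numpy is the only dependency, and the function name and data are illustrative rather than the pipeline's exact code.

# Condensed sketch of the coverage check (synthetic data; not pipeline code).
import numpy as np

def gap_coverage(words, sep_x, gutter_width=10):
    # words: (text, x1, y1, x2, y2) boxes in the same coordinate space as sep_x.
    ys = [w[2] for w in words] + [w[4] for w in words]
    min_y, max_y = min(ys), max(ys)
    height = int(max_y - min_y)
    if height <= 0:
        return 0.0
    open_mask = np.ones(height + 1, dtype=bool)   # one cell per vertical unit of text span
    left, right = sep_x - gutter_width / 2, sep_x + gutter_width / 2
    for _, x1, y1, x2, y2 in words:
        if x2 > left and x1 < right:              # word crosses the candidate gutter
            a = max(0, int(y1 - min_y))
            b = min(len(open_mask), int(y2 - min_y))
            open_mask[a:b] = False                # this stretch of Y is blocked
    return float(open_mask.sum()) / len(open_mask)

# Two-column layout: every line ends left of x=300 or starts right of x=310.
two_col = [("w", 50, 10 * i, 290, 10 * i + 8) for i in range(40)] + \
          [("w", 320, 10 * i, 560, 10 * i + 8) for i in range(40)]
# Single-column layout: lines span the full width and cross x=300.
one_col = [("w", 50, 10 * i, 560, 10 * i + 8) for i in range(40)]

print(f"two-column coverage: {gap_coverage(two_col, 300):.0%}")  # 100% -> accepted (>= 65%)
print(f"one-column coverage: {gap_coverage(one_col, 300):.0%}")  # ~20%  -> rejected

On a genuine two-column page nearly the whole text span is open at the gutter, while full-width lines leave only the inter-line gaps open, which is why the fixed 0.65 threshold separates the two cases cleanly.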