heerjtdev committed
Commit e9f6960 · Parent(s): 4ff69d8

Update working_yolo_pipeline.py

Files changed (1)
  1. working_yolo_pipeline.py +56 -36
working_yolo_pipeline.py CHANGED
@@ -1543,7 +1543,6 @@
 # sys.exit(1)
 
 
-
 import json
 import argparse
 import os
@@ -1700,7 +1699,6 @@ def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_
         return raw_word_data
 
     # 1. Convert YOLO boxes (Pixels) to PDF Coordinates (Points)
-    # Note: raw_word_data is in PDF points. YOLO is in Pixels (scaled by scale_factor).
     pdf_space_boxes = []
     for det in yolo_detections:
         x1, y1, x2, y2 = det['coords']
@@ -1715,7 +1713,6 @@ def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_
     # 2. Filter out raw words that are inside YOLO boxes
     cleaned_word_data = []
     for word_tuple in raw_word_data:
-        # word_tuple format: (text, x1, y1, x2, y2, block_no, line_no, word_no)
         wx1, wy1, wx2, wy2 = word_tuple[1], word_tuple[2], word_tuple[3], word_tuple[4]
         w_center_x = (wx1 + wx2) / 2
         w_center_y = (wy1 + wy2) / 2
@@ -1731,15 +1728,55 @@ def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_
 
     # 3. Add the YOLO boxes themselves as "Solid Words"
     for i, (px1, py1, px2, py2) in enumerate(pdf_space_boxes):
-        # Create a dummy tuple matching the structure: (text, x1, y1, x2, y2)
         dummy_entry = (f"BLOCK_{i}", px1, py1, px2, py2)
         cleaned_word_data.append(dummy_entry)
 
     return cleaned_word_data
 
 
+def calculate_vertical_gap_coverage(word_data: list, sep_x: int, page_height: float, gutter_width: int = 10) -> float:
+    """
+    Calculates what percentage of the page's vertical text span is 'cleanly split' by the separator.
+    A valid column split should split > 65% of the page verticality.
+    """
+    if not word_data:
+        return 0.0
+
+    # 1. Determine the vertical span of the actual text content
+    y_coords = [w[2] for w in word_data] + [w[4] for w in word_data]  # y1 and y2
+    min_y, max_y = min(y_coords), max(y_coords)
+    total_text_height = max_y - min_y
+
+    if total_text_height <= 0:
+        return 0.0
+
+    # 2. Create a boolean array representing the Y-axis
+    gap_open_mask = np.ones(int(total_text_height) + 1, dtype=bool)
+
+    # Define the x-zone of the separator
+    zone_left = sep_x - (gutter_width / 2)
+    zone_right = sep_x + (gutter_width / 2)
+
+    # 3. Mark areas where the gap is "Blocked" by text
+    offset_y = int(min_y)
+
+    for _, x1, y1, x2, y2 in word_data:
+        # Check if this word horizontally interferes with the separator
+        if x2 > zone_left and x1 < zone_right:
+            y_start_idx = max(0, int(y1) - offset_y)
+            y_end_idx = min(len(gap_open_mask), int(y2) - offset_y)
+            if y_end_idx > y_start_idx:
+                gap_open_mask[y_start_idx:y_end_idx] = False
+
+    # 4. Calculate coverage
+    open_pixels = np.sum(gap_open_mask)
+    coverage_ratio = open_pixels / len(gap_open_mask)
+
+    return coverage_ratio
+
+
 def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> List[int]:
-    """Calculates the X-axis histogram and detects significant gutters."""
+    """Calculates the X-axis histogram and detects significant gutters using Vertical Coverage."""
     if not word_data: return []
 
     x_points = []
@@ -1771,22 +1808,17 @@ def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> Li
     separator_x_coords = [int(bin_edges[p]) for p in peaks]
     final_separators = []
 
-    # Validation: Check if gap is "blocked" significantly
-    # Since we have YOLO blocks merged in, we can trust the histogram more.
-    # But we still do a quick density check.
-    window_bins = max(1, int(10 / bin_size))
-
-    for peak_idx, x_coord in zip(peaks, separator_x_coords):
-        start_bin = max(0, peak_idx - window_bins)
-        end_bin = min(len(hist), peak_idx + window_bins)
-        raw_density_in_gutter = np.sum(hist[start_bin:end_bin])
-
-        left_density = np.sum(hist[max(0, start_bin - 20):start_bin])
-        right_density = np.sum(hist[end_bin:min(len(hist), end_bin + 20)])
-        neighbor_avg = (left_density + right_density) / 2
-
-        if neighbor_avg > 0 and raw_density_in_gutter < neighbor_avg * 0.4:
+    for x_coord in separator_x_coords:
+        # KEY CHECK: Vertical Coverage
+        # A column gap must exist for >65% of the page height.
+        # Single-column text (width-spanning lines) will result in ~0% coverage.
+        coverage = calculate_vertical_gap_coverage(word_data, x_coord, page_height, gutter_width=min_width)
+
+        if coverage >= 0.65:
             final_separators.append(x_coord)
+            print(f" -> Separator candidate X={x_coord} ACCEPTED (Coverage: {coverage:.1%})")
+        else:
+            print(f" -> Separator candidate X={x_coord} REJECTED (Coverage: {coverage:.1%})")
 
     return sorted(final_separators)
 
@@ -1862,7 +1894,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     OPTIMIZED FLOW:
     1. Run YOLO to find Equations/Tables.
     2. Mask raw text with YOLO boxes.
-    3. Run Column Detection on the MASKED data (prevents false positives in matrices).
+    3. Run Column Detection on the MASKED data.
    4. Proceed with OCR and Output.
     """
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
@@ -1877,7 +1909,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     # --- STEP 1: YOLO DETECTION (MOVED TO FIRST STEP) ---
     # ====================================================================
     start_time_yolo = time.time()
-    # Run inference
     results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
 
     relevant_detections = []
@@ -1891,25 +1922,21 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                 {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
             )
 
-    # Merge overlapping boxes immediately
     merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
-    print(f" [LOG] YOLO found {len(merged_detections)} objects (Equation/Figure) in {time.time() - start_time_yolo:.3f}s.")
+    print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
 
     # ====================================================================
     # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
     # ====================================================================
-    # Get raw PDF words first
     raw_words_for_layout = get_word_data_for_detection(
         fitz_page, pdf_path, page_num,
         top_margin_percent=0.10, bottom_margin_percent=0.10
     )
 
-    # KEY CHANGE: Inject YOLO boxes into the word list before detection
-    # Assuming 2.0 scale factor because fitz.Matrix(2.0, 2.0) was used in main loop
     masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
 
     # ====================================================================
-    # --- STEP 3: COLUMN DETECTION (Now running on MASKED data) ---
+    # --- STEP 3: COLUMN DETECTION (MASKED + COVERAGE CHECK) ---
     # ====================================================================
     page_width_pdf = fitz_page.rect.width
     page_height_pdf = fitz_page.rect.height
@@ -1919,10 +1946,8 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
         'cluster_min_width': 20, 'cluster_threshold_percentile': 85,
     }
 
-    # Use calculate_x_gutters directly on the masked data
     separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
 
-    # Finalize separator logic
     page_separator_x = None
     if separators:
         central_min = page_width_pdf * 0.35
@@ -1985,7 +2010,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     except Exception as e:
         print(f" ❌ Native text extraction failed: {e}")
 
-    # If native failed, use Tesseract
     if not raw_ocr_output:
         if _ocr_cache.has_ocr(pdf_path, page_num):
             print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
@@ -2002,7 +2026,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                     'y0': y1_pix, 'x0': x1_pix
                 })
         else:
-            # Run Tesseract
             try:
                 pil_img = Image.fromarray(cv2.cvtColor(original_img, cv2.COLOR_BGR2RGB))
                 hocr_data = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT)
@@ -2052,7 +2075,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                 placed = True
                 break
         if not placed and item['type'] in ['equation', 'figure']:
-            # Relaxed tolerance for big objects
             for line in lines:
                 y_ref = min(it['y0'] for it in line)
                 if abs(y_ref - item['y0']) < 20:
@@ -2739,6 +2761,4 @@ if __name__ == "__main__":
         print(f"\n✅ Final Data Saved: {final_output_path}")
     else:
         print("\n❌ Pipeline Failed.")
-        sys.exit(1)
-
-
+        sys.exit(1)
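
For readers skimming the diff: the masking step retained above converts each YOLO box from rendered-image pixels back to PDF points (the page is rendered at 2x, per the removed comment about fitz.Matrix(2.0, 2.0)), drops every raw word whose centre falls inside a box, and re-inserts the box itself as one solid "word". The snippet below is a minimal, self-contained sketch of that idea only; it assumes simple 5-element (text, x1, y1, x2, y2) word tuples, and the helper name and sample data are illustrative, not part of the pipeline.

# Illustrative sketch only (hypothetical helper, not the pipeline's own code).
# Assumes 5-element (text, x1, y1, x2, y2) word tuples and a 2.0 render scale.
def mask_words_with_boxes(words, yolo_boxes_px, scale_factor=2.0):
    # 1. Pixels -> points (the page image was rendered at scale_factor x resolution).
    boxes_pts = [tuple(c / scale_factor for c in box) for box in yolo_boxes_px]

    kept = []
    for text, x1, y1, x2, y2 in words:
        cx, cy = (x1 + x2) / 2, (y1 + y2) / 2
        # 2. Drop any word whose centre lies inside a detected block.
        inside = any(bx1 <= cx <= bx2 and by1 <= cy <= by2
                     for bx1, by1, bx2, by2 in boxes_pts)
        if not inside:
            kept.append((text, x1, y1, x2, y2))

    # 3. Re-insert each block as a single solid "word" so the column
    #    histogram still sees ink where the equation/figure sits.
    for i, (bx1, by1, bx2, by2) in enumerate(boxes_pts):
        kept.append((f"BLOCK_{i}", bx1, by1, bx2, by2))
    return kept


if __name__ == "__main__":
    words = [("alpha", 50, 100, 90, 112), ("beta", 300, 100, 340, 112)]
    boxes_px = [(560, 180, 720, 260)]  # detected on the 2x-rendered image
    print(mask_words_with_boxes(words, boxes_px))  # "beta" is masked; BLOCK_0 appended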
 
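The substantive change in this commit is the vertical-coverage validation of column-separator candidates: a candidate X position is kept only if at least 65% of the page's vertical text span is free of ink inside the candidate gutter, replacing the earlier local density check. Below is a condensed, self-contained sketch of the same idea on synthetic word boxes; numpy is the only dependency, and the function name and data are illustrative rather than the pipeline's exact code.

# Condensed sketch of the coverage check (synthetic data; not pipeline code).
import numpy as np

def gap_coverage(words, sep_x, gutter_width=10):
    # words: (text, x1, y1, x2, y2) boxes in the same coordinate space as sep_x.
    ys = [w[2] for w in words] + [w[4] for w in words]
    min_y, max_y = min(ys), max(ys)
    height = int(max_y - min_y)
    if height <= 0:
        return 0.0
    open_mask = np.ones(height + 1, dtype=bool)   # one cell per vertical unit of text span
    left, right = sep_x - gutter_width / 2, sep_x + gutter_width / 2
    for _, x1, y1, x2, y2 in words:
        if x2 > left and x1 < right:              # word crosses the candidate gutter
            a = max(0, int(y1 - min_y))
            b = min(len(open_mask), int(y2 - min_y))
            open_mask[a:b] = False                # this stretch of Y is blocked
    return float(open_mask.sum()) / len(open_mask)

# Two-column layout: every line ends left of x=300 or starts right of x=310.
two_col = [("w", 50, 10 * i, 290, 10 * i + 8) for i in range(40)] + \
          [("w", 320, 10 * i, 560, 10 * i + 8) for i in range(40)]
# Single-column layout: lines span the full width and cross x=300.
one_col = [("w", 50, 10 * i, 560, 10 * i + 8) for i in range(40)]

print(f"two-column coverage: {gap_coverage(two_col, 300):.0%}")  # 100% -> accepted (>= 65%)
print(f"one-column coverage: {gap_coverage(one_col, 300):.0%}")  # ~20%  -> rejected

On a genuine two-column page nearly the whole text span is open at the gutter, while full-width lines leave only the inter-line gaps open, which is why the fixed 0.65 threshold separates the two cases cleanly.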