Update working_yolo_pipeline.py
working_yolo_pipeline.py (+56 -36)
@@ -1543,7 +1543,6 @@
 # sys.exit(1)
 
 
-
 import json
 import argparse
 import os
@@ -1700,7 +1699,6 @@ def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_
         return raw_word_data
 
     # 1. Convert YOLO boxes (Pixels) to PDF Coordinates (Points)
-    # Note: raw_word_data is in PDF points. YOLO is in Pixels (scaled by scale_factor).
     pdf_space_boxes = []
     for det in yolo_detections:
         x1, y1, x2, y2 = det['coords']
@@ -1715,7 +1713,6 @@ def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_
     # 2. Filter out raw words that are inside YOLO boxes
     cleaned_word_data = []
     for word_tuple in raw_word_data:
-        # word_tuple format: (text, x1, y1, x2, y2, block_no, line_no, word_no)
         wx1, wy1, wx2, wy2 = word_tuple[1], word_tuple[2], word_tuple[3], word_tuple[4]
         w_center_x = (wx1 + wx2) / 2
         w_center_y = (wy1 + wy2) / 2
@@ -1731,15 +1728,55 @@ def merge_yolo_into_word_data(raw_word_data: list, yolo_detections: list, scale_
 
     # 3. Add the YOLO boxes themselves as "Solid Words"
     for i, (px1, py1, px2, py2) in enumerate(pdf_space_boxes):
-        # Create a dummy tuple matching the structure: (text, x1, y1, x2, y2)
         dummy_entry = (f"BLOCK_{i}", px1, py1, px2, py2)
         cleaned_word_data.append(dummy_entry)
 
     return cleaned_word_data
 
 
+def calculate_vertical_gap_coverage(word_data: list, sep_x: int, page_height: float, gutter_width: int = 10) -> float:
+    """
+    Calculates what percentage of the page's vertical text span is 'cleanly split' by the separator.
+    A valid column split should split > 65% of the page verticality.
+    """
+    if not word_data:
+        return 0.0
+
+    # 1. Determine the vertical span of the actual text content
+    y_coords = [w[2] for w in word_data] + [w[4] for w in word_data]  # y1 and y2
+    min_y, max_y = min(y_coords), max(y_coords)
+    total_text_height = max_y - min_y
+
+    if total_text_height <= 0:
+        return 0.0
+
+    # 2. Create a boolean array representing the Y-axis
+    gap_open_mask = np.ones(int(total_text_height) + 1, dtype=bool)
+
+    # Define the x-zone of the separator
+    zone_left = sep_x - (gutter_width / 2)
+    zone_right = sep_x + (gutter_width / 2)
+
+    # 3. Mark areas where the gap is "Blocked" by text
+    offset_y = int(min_y)
+
+    for _, x1, y1, x2, y2 in word_data:
+        # Check if this word horizontally interferes with the separator
+        if x2 > zone_left and x1 < zone_right:
+            y_start_idx = max(0, int(y1) - offset_y)
+            y_end_idx = min(len(gap_open_mask), int(y2) - offset_y)
+            if y_end_idx > y_start_idx:
+                gap_open_mask[y_start_idx:y_end_idx] = False
+
+    # 4. Calculate coverage
+    open_pixels = np.sum(gap_open_mask)
+    coverage_ratio = open_pixels / len(gap_open_mask)
+
+    return coverage_ratio
+
+
 def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> List[int]:
-    """Calculates the X-axis histogram and detects significant gutters."""
+    """Calculates the X-axis histogram and detects significant gutters using Vertical Coverage."""
     if not word_data: return []
 
     x_points = []
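For orientation, the hunks above all belong to merge_yolo_into_word_data: detector boxes are converted from render pixels to PDF points, words whose centers fall inside a box are dropped, and one solid BLOCK_i placeholder is injected per box. The sketch below is a minimal, self-contained reading of that idea, not the repository's code; the helper name mask_words_with_boxes is invented, and the center-inside-box test is inferred from the w_center_x/w_center_y lines since the exact comparison is outside this diff.

def mask_words_with_boxes(word_tuples, pixel_boxes, scale_factor=2.0):
    """Sketch: pixel boxes -> PDF points, drop covered words, inject solid placeholders."""
    # Convert detector boxes from rendered-image pixels to PDF points.
    pdf_boxes = [(x1 / scale_factor, y1 / scale_factor, x2 / scale_factor, y2 / scale_factor)
                 for (x1, y1, x2, y2) in pixel_boxes]

    kept = []
    for w in word_tuples:  # (text, x1, y1, x2, y2, ...) in PDF points
        cx, cy = (w[1] + w[3]) / 2, (w[2] + w[4]) / 2
        inside = any(px1 <= cx <= px2 and py1 <= cy <= py2
                     for (px1, py1, px2, py2) in pdf_boxes)
        if not inside:
            kept.append(w)

    # Each detection becomes one "solid word" so column detection sees it as opaque.
    for i, (px1, py1, px2, py2) in enumerate(pdf_boxes):
        kept.append((f"BLOCK_{i}", px1, py1, px2, py2))
    return kept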
@@ -1771,22 +1808,17 @@ def calculate_x_gutters(word_data: list, params: Dict, page_height: float) -> List[int]:
     separator_x_coords = [int(bin_edges[p]) for p in peaks]
     final_separators = []
 
-
-
-
-
-
-
-
-        end_bin = min(len(hist), peak_idx + window_bins)
-        raw_density_in_gutter = np.sum(hist[start_bin:end_bin])
-
-        left_density = np.sum(hist[max(0, start_bin - 20):start_bin])
-        right_density = np.sum(hist[end_bin:min(len(hist), end_bin + 20)])
-        neighbor_avg = (left_density + right_density) / 2
-
-        if neighbor_avg > 0 and raw_density_in_gutter < neighbor_avg * 0.4:
+    for x_coord in separator_x_coords:
+        # KEY CHECK: Vertical Coverage
+        # A column gap must exist for >65% of the page height.
+        # Single-column text (width-spanning lines) will result in ~0% coverage.
+        coverage = calculate_vertical_gap_coverage(word_data, x_coord, page_height, gutter_width=min_width)
+
+        if coverage >= 0.65:
             final_separators.append(x_coord)
+            print(f" -> Separator candidate X={x_coord} ACCEPTED (Coverage: {coverage:.1%})")
+        else:
+            print(f" -> Separator candidate X={x_coord} REJECTED (Coverage: {coverage:.1%})")
 
     return sorted(final_separators)
 
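To make the new acceptance rule concrete, here is a condensed, self-contained rewrite of the coverage check applied to two synthetic layouts. The helper name, coordinates, and word sizes are invented for the demo; the blocked-strip logic and the 0.65 threshold mirror the hunk above.

import numpy as np

def vertical_gap_coverage(words, sep_x, gutter_width=10):
    # Fraction of the text's vertical span where nothing overlaps a strip centred on sep_x.
    ys = [w[2] for w in words] + [w[4] for w in words]
    min_y, max_y = min(ys), max(ys)
    if max_y - min_y <= 0:
        return 0.0
    open_mask = np.ones(int(max_y - min_y) + 1, dtype=bool)
    zone_left, zone_right = sep_x - gutter_width / 2, sep_x + gutter_width / 2
    for _, x1, y1, x2, y2 in words:
        if x2 > zone_left and x1 < zone_right:  # word crosses the gutter strip
            open_mask[max(0, int(y1 - min_y)):min(len(open_mask), int(y2 - min_y))] = False
    return float(np.sum(open_mask)) / len(open_mask)

# Two columns: words span x 50-280 and 320-550, so the strip at x=300 is never blocked.
two_col = [("w", 50, y, 280, y + 10) for y in range(72, 720, 14)] + \
          [("w", 320, y, 550, y + 10) for y in range(72, 720, 14)]
# Single column: every line spans the full width and blocks x=300.
one_col = [("w", 50, y, 550, y + 10) for y in range(72, 720, 14)]

print(vertical_gap_coverage(two_col, 300))  # 1.0  -> would be ACCEPTED (>= 0.65)
print(vertical_gap_coverage(one_col, 300))  # ~0.3 -> would be REJECTED (< 0.65)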
@@ -1862,7 +1894,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     OPTIMIZED FLOW:
     1. Run YOLO to find Equations/Tables.
     2. Mask raw text with YOLO boxes.
-    3. Run Column Detection on the MASKED data
+    3. Run Column Detection on the MASKED data.
     4. Proceed with OCR and Output.
     """
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
@@ -1877,7 +1909,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     # --- STEP 1: YOLO DETECTION (MOVED TO FIRST STEP) ---
     # ====================================================================
     start_time_yolo = time.time()
-    # Run inference
     results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
 
     relevant_detections = []
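The next hunk only shows the dict literal each detection is reduced to. For context, a typical Ultralytics unpacking that would produce those fields looks roughly like the sketch below; extract_relevant_detections, the target_classes filter values, and the conf default stand in for names not visible in this diff (the file uses CONF_THRESHOLD).

from ultralytics import YOLO

def extract_relevant_detections(model, img, target_classes=("formula", "table"), conf=0.25):
    # target_classes values are placeholders; the real class filter is outside this hunk.
    detections = []
    for result in model.predict(source=img, conf=conf, imgsz=640, verbose=False):
        for box in result.boxes:
            class_name = result.names[int(box.cls[0])]
            if class_name not in target_classes:
                continue
            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            detections.append({'coords': (x1, y1, x2, y2), 'y1': y1,
                               'class': class_name, 'conf': float(box.conf[0])})
    return detections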
@@ -1891,25 +1922,21 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                 {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
             )
 
-    # Merge overlapping boxes immediately
     merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
-    print(f" [LOG] YOLO found {len(merged_detections)} objects
+    print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
 
     # ====================================================================
     # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
     # ====================================================================
-    # Get raw PDF words first
     raw_words_for_layout = get_word_data_for_detection(
         fitz_page, pdf_path, page_num,
         top_margin_percent=0.10, bottom_margin_percent=0.10
     )
 
-    # KEY CHANGE: Inject YOLO boxes into the word list before detection
-    # Assuming 2.0 scale factor because fitz.Matrix(2.0, 2.0) was used in main loop
     masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
 
     # ====================================================================
-    # --- STEP 3: COLUMN DETECTION (
+    # --- STEP 3: COLUMN DETECTION (MASKED + COVERAGE CHECK) ---
     # ====================================================================
     page_width_pdf = fitz_page.rect.width
     page_height_pdf = fitz_page.rect.height
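The scale_factor=2.0 passed above rests on the (removed) comment that pages are rendered with fitz.Matrix(2.0, 2.0) in the main loop. A quick self-contained check of that assumption, using a hypothetical example.pdf, looks like this:

import fitz  # PyMuPDF

doc = fitz.open("example.pdf")            # hypothetical input file
page = doc[0]
pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
print(page.rect.width, pix.width)         # pix.width is ~2 * page.rect.width (rounded to int)

# So a box detected on the rendered image converts back to PDF points by dividing by 2.0:
x1_pix, y1_pix, x2_pix, y2_pix = 200, 300, 900, 450
scale_factor = 2.0
pdf_box = (x1_pix / scale_factor, y1_pix / scale_factor,
           x2_pix / scale_factor, y2_pix / scale_factor)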
@@ -1919,10 +1946,8 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
         'cluster_min_width': 20, 'cluster_threshold_percentile': 85,
     }
 
-    # Use calculate_x_gutters directly on the masked data
     separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
 
-    # Finalize separator logic
     page_separator_x = None
     if separators:
         central_min = page_width_pdf * 0.35
@@ -1985,7 +2010,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     except Exception as e:
         print(f" ❌ Native text extraction failed: {e}")
 
-    # If native failed, use Tesseract
     if not raw_ocr_output:
         if _ocr_cache.has_ocr(pdf_path, page_num):
             print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
@@ -2002,7 +2026,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                     'y0': y1_pix, 'x0': x1_pix
                 })
         else:
-            # Run Tesseract
             try:
                 pil_img = Image.fromarray(cv2.cvtColor(original_img, cv2.COLOR_BGR2RGB))
                 hocr_data = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT)
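For readers unfamiliar with the Tesseract fallback, image_to_data with Output.DICT returns parallel lists ('text', 'left', 'top', 'width', 'height', 'conf', ...). A common way to turn that output into word boxes is sketched below; this is illustrative post-processing, not necessarily identical to what the file does with hocr_data.

import cv2
import pytesseract
from PIL import Image

def tesseract_word_boxes(bgr_img):
    # Convert the BGR frame to RGB for PIL, then run Tesseract's structured output.
    pil_img = Image.fromarray(cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB))
    data = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT)
    words = []
    for i, text in enumerate(data['text']):
        if not text.strip():
            continue  # skip the empty cells Tesseract emits for page/block/line rows
        x, y = data['left'][i], data['top'][i]
        w, h = data['width'][i], data['height'][i]
        words.append({'text': text, 'x0': x, 'y0': y, 'x1': x + w, 'y1': y + h,
                      'conf': float(data['conf'][i])})
    return words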
@@ -2052,7 +2075,6 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                     placed = True
                     break
             if not placed and item['type'] in ['equation', 'figure']:
-                # Relaxed tolerance for big objects
                 for line in lines:
                     y_ref = min(it['y0'] for it in line)
                     if abs(y_ref - item['y0']) < 20:
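The hunk above shows only the relaxed second pass of the line-grouping step: items that did not land in an existing line on the strict pass get a looser 20-point tolerance if they are equations or figures. A minimal sketch of that two-tolerance grouping is below; group_into_lines and the strict text_tol value are assumptions (only the 20-point relaxed tolerance appears in the diff), and the file's bookkeeping is simplified.

def group_into_lines(items, text_tol=10, big_tol=20):
    # Items are dicts with 'y0' and 'type'; a "line" is a list of items sharing a y band.
    lines = []
    for item in sorted(items, key=lambda it: it['y0']):
        tol = big_tol if item['type'] in ('equation', 'figure') else text_tol
        placed = False
        for line in lines:
            y_ref = min(it['y0'] for it in line)
            if abs(y_ref - item['y0']) < tol:
                line.append(item)
                placed = True
                break
        if not placed:
            lines.append([item])
    return lines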
@@ -2739,6 +2761,4 @@ if __name__ == "__main__":
        print(f"\n✅ Final Data Saved: {final_output_path}")
    else:
        print("\n❌ Pipeline Failed.")
-        sys.exit(1)
-
-
+        sys.exit(1)