heerjtdev committed · Commit 3541663 · verified · 1 Parent(s): f0a088d

Update working_yolo_pipeline.py

Files changed (1):
  1. working_yolo_pipeline.py +113 -110

working_yolo_pipeline.py CHANGED
@@ -579,6 +579,9 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
     return converted_ocr_output
 
 
+
+
+
 # def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
 #                             page_num: int, fitz_page: fitz.Page,
 #                             pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
@@ -587,7 +590,7 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
 #     1. Run YOLO to find Equations/Tables.
 #     2. Mask raw text with YOLO boxes.
 #     3. Run Column Detection on the MASKED data.
-#     4. Proceed with OCR and Output.
+#     4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
 #     """
 #     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
 
@@ -598,7 +601,7 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
 #         return None, None
 
 #     # ====================================================================
-#     # --- STEP 1: YOLO DETECTION (MOVED TO FIRST STEP) ---
+#     # --- STEP 1: YOLO DETECTION ---
 #     # ====================================================================
 #     start_time_yolo = time.time()
 #     results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
@@ -613,16 +616,14 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
 #             relevant_detections.append(
 #                 {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
 #             )
-#     non_nested_detections = filter_nested_boxes(relevant_detections, ioa_threshold=0.85)
 
-#     # 2. Then, run your standard overlap merge on what remains
-#     merged_detections = merge_overlapping_boxes(non_nested_detections, IOU_MERGE_THRESHOLD)
-#     # merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
+#     merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
 #     print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
 
 #     # ====================================================================
 #     # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
 #     # ====================================================================
+#     # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
 #     raw_words_for_layout = get_word_data_for_detection(
 #         fitz_page, pdf_path, page_num,
 #         top_margin_percent=0.10, bottom_margin_percent=0.10
@@ -631,7 +632,7 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
 #     masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
 
 #     # ====================================================================
-#     # --- STEP 3: COLUMN DETECTION (MASKED + COVERAGE + BRIDGING CHECK) ---
+#     # --- STEP 3: COLUMN DETECTION ---
 #     # ====================================================================
 #     page_width_pdf = fitz_page.rect.width
 #     page_height_pdf = fitz_page.rect.height
@@ -698,45 +699,88 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
 #     # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
 #     # ====================================================================
 #     raw_ocr_output = []
-#     scale_factor = 2.0
+#     scale_factor = 2.0  # Pipeline standard scale
 
 #     try:
+#         # Try getting native text first
 #         raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
 #     except Exception as e:
 #         print(f" ❌ Native text extraction failed: {e}")
 
+#     # If native text is missing, fall back to OCR
 #     if not raw_ocr_output:
 #         if _ocr_cache.has_ocr(pdf_path, page_num):
 #             print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
 #             cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
 #             for word_tuple in cached_word_data:
 #                 word_text, x1, y1, x2, y2 = word_tuple
+
+#                 # Scale from PDF points to Pipeline Pixels (2.0)
 #                 x1_pix = int(x1 * scale_factor)
 #                 y1_pix = int(y1 * scale_factor)
 #                 x2_pix = int(x2 * scale_factor)
 #                 y2_pix = int(y2 * scale_factor)
+
 #                 raw_ocr_output.append({
 #                     'type': 'text', 'word': word_text, 'confidence': 95.0,
 #                     'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
 #                     'y0': y1_pix, 'x0': x1_pix
 #                 })
 #         else:
+#             # === START OF OPTIMIZED OCR BLOCK ===
 #             try:
-#                 pil_img = Image.fromarray(cv2.cvtColor(original_img, cv2.COLOR_BGR2RGB))
-#                 hocr_data = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT)
+#                 # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
+#                 # We do this specifically for OCR accuracy, separate from the pipeline image
+#                 ocr_zoom = 4.0
+#                 pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
+
+#                 # Convert PyMuPDF Pixmap to OpenCV format
+#                 img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width, pix_ocr.n)
+#                 if pix_ocr.n == 3: img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
+#                 elif pix_ocr.n == 4: img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
+
+#                 # 2. Preprocess (Binarization)
+#                 # Ensure 'preprocess_image_for_ocr' is defined at top of file!
+#                 processed_img = preprocess_image_for_ocr(img_ocr_np)
+
+#                 # 3. Run Tesseract with Optimized Configuration
+#                 # --oem 3: Default LSTM engine
+#                 # --psm 6: Assume a single uniform block of text (Critical for lists/questions)
+#                 custom_config = r'--oem 3 --psm 6'
+
+#                 hocr_data = pytesseract.image_to_data(
+#                     processed_img,
+#                     output_type=pytesseract.Output.DICT,
+#                     config=custom_config
+#                 )
+
 #                 for i in range(len(hocr_data['level'])):
 #                     text = hocr_data['text'][i].strip()
 #                     if text and hocr_data['conf'][i] > -1:
-#                         x1 = int(hocr_data['left'][i])
-#                         y1 = int(hocr_data['top'][i])
-#                         x2 = x1 + int(hocr_data['width'][i])
-#                         y2 = y1 + int(hocr_data['height'][i])
+
+#                         # 4. Coordinate Mapping
+#                         # We scanned at Zoom 4.0, but our pipeline expects Zoom 2.0.
+#                         # Scale Factor = (Target 2.0) / (Source 4.0) = 0.5
+#                         scale_adjustment = scale_factor / ocr_zoom
+
+#                         x1 = int(hocr_data['left'][i] * scale_adjustment)
+#                         y1 = int(hocr_data['top'][i] * scale_adjustment)
+#                         w = int(hocr_data['width'][i] * scale_adjustment)
+#                         h = int(hocr_data['height'][i] * scale_adjustment)
+#                         x2 = x1 + w
+#                         y2 = y1 + h
+
 #                         raw_ocr_output.append({
-#                             'type': 'text', 'word': text, 'confidence': float(hocr_data['conf'][i]),
-#                             'bbox': [x1, y1, x2, y2], 'y0': y1, 'x0': x1
+#                             'type': 'text',
+#                             'word': text,
+#                             'confidence': float(hocr_data['conf'][i]),
+#                             'bbox': [x1, y1, x2, y2],
+#                             'y0': y1,
+#                             'x0': x1
 #                         })
 #             except Exception as e:
 #                 print(f" ❌ Tesseract OCR Error: {e}")
+#             # === END OF OPTIMIZED OCR BLOCK ===
 
 #     # ====================================================================
 #     # --- STEP 6: OCR CLEANING AND MERGING ---
@@ -746,6 +790,7 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
 #     for ocr_word in raw_ocr_output:
 #         is_suppressed = False
 #         for component in component_metadata:
+#             # Do not include words that are inside figure/equation boxes
 #             ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
 #             if ioa > IOA_SUPPRESSION_THRESHOLD:
 #                 is_suppressed = True
@@ -753,6 +798,7 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
 #         if not is_suppressed:
 #             items_to_sort.append(ocr_word)
 
+#     # Add figures/equations back into the flow as "words"
 #     items_to_sort.extend(component_metadata)
 
 #     # ====================================================================
@@ -799,11 +845,11 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                             page_num: int, fitz_page: fitz.Page,
                             pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
     """
-    OPTIMIZED FLOW:
+    OPTIMIZED FLOW:
     1. Run YOLO to find Equations/Tables.
     2. Mask raw text with YOLO boxes.
-    3. Run Column Detection on the MASKED data.
-    4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
+    3. Run Column Detection on the MASKED data (Populates OCR cache).
+    4. Proceed with Final OCR Output (Strictly using the cache).
     """
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
 
@@ -818,7 +864,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     # ====================================================================
     start_time_yolo = time.time()
    results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
-
+
    relevant_detections = []
    if results and results[0].boxes:
        for box in results[0].boxes:
@@ -834,9 +880,10 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
    print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
 
    # ====================================================================
-    # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
+    # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING & CACHING) ---
+    # This call to get_word_data_for_detection will execute Tesseract if
+    # native words are missing, and save the result to the cache.
    # ====================================================================
-    # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
    raw_words_for_layout = get_word_data_for_detection(
        fitz_page, pdf_path, page_num,
        top_margin_percent=0.10, bottom_margin_percent=0.10
@@ -848,21 +895,21 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
    # --- STEP 3: COLUMN DETECTION ---
    # ====================================================================
    page_width_pdf = fitz_page.rect.width
-    page_height_pdf = fitz_page.rect.height
-
+    page_height_pdf = fitz_page.rect.height
+
    column_detection_params = {
        'cluster_bin_size': 2, 'cluster_smoothing': 2,
        'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
    }
 
    separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
-
+
    page_separator_x = None
    if separators:
        central_min = page_width_pdf * 0.35
        central_max = page_width_pdf * 0.65
        central_separators = [s for s in separators if central_min <= s <= central_max]
-
+
        if central_separators:
            center_x = page_width_pdf / 2
            page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
@@ -909,97 +956,41 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
        })
 
    # ====================================================================
-    # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
+    # --- STEP 5: CACHED OCR RETRIEVAL (No Redundant Tesseract) ---
    # ====================================================================
    raw_ocr_output = []
-    scale_factor = 2.0  # Pipeline standard scale
+    scale_factor = 2.0  # Pipeline standard scale
 
-    try:
-        # Try getting native text first
-        raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
-    except Exception as e:
-        print(f" ❌ Native text extraction failed: {e}")
+    if _ocr_cache.has_ocr(pdf_path, page_num):
+        print(f" ⚡ Using cached OCR (Native or Tesseract) for page {page_num}")
+        cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
+
+        for word_tuple in cached_word_data:
+            # Cache stores: (text, x1, y1, x2, y2) in PDF points (see get_word_data_for_detection)
+            word_text, x1, y1, x2, y2 = word_tuple
+
+            # Scale from PDF points back to Pipeline Pixels (2.0)
+            x1_pix = int(x1 * scale_factor)
+            y1_pix = int(y1 * scale_factor)
+            x2_pix = int(x2 * scale_factor)
+            y2_pix = int(y2 * scale_factor)
+
+            raw_ocr_output.append({
+                'type': 'text', 'word': word_text, 'confidence': 95.0,  # 95.0 is a default/placeholder confidence
+                'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
+                'y0': y1_pix, 'x0': x1_pix
+            })
+    else:
+        # This branch is hit only if the cache check in Step 2 failed to produce text,
+        # meaning the page is genuinely textless or entirely composed of images/figures.
+        print(f" ⚠️ No text found in cache for page {page_num}. Proceeding without words.")
 
-    # If native text is missing, fall back to OCR
-    if not raw_ocr_output:
-        if _ocr_cache.has_ocr(pdf_path, page_num):
-            print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
-            cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
-            for word_tuple in cached_word_data:
-                word_text, x1, y1, x2, y2 = word_tuple
-
-                # Scale from PDF points to Pipeline Pixels (2.0)
-                x1_pix = int(x1 * scale_factor)
-                y1_pix = int(y1 * scale_factor)
-                x2_pix = int(x2 * scale_factor)
-                y2_pix = int(y2 * scale_factor)
-
-                raw_ocr_output.append({
-                    'type': 'text', 'word': word_text, 'confidence': 95.0,
-                    'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
-                    'y0': y1_pix, 'x0': x1_pix
-                })
-        else:
-            # === START OF OPTIMIZED OCR BLOCK ===
-            try:
-                # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
-                # We do this specifically for OCR accuracy, separate from the pipeline image
-                ocr_zoom = 4.0
-                pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
-
-                # Convert PyMuPDF Pixmap to OpenCV format
-                img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width, pix_ocr.n)
-                if pix_ocr.n == 3: img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
-                elif pix_ocr.n == 4: img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
-
-                # 2. Preprocess (Binarization)
-                # Ensure 'preprocess_image_for_ocr' is defined at top of file!
-                processed_img = preprocess_image_for_ocr(img_ocr_np)
-
-                # 3. Run Tesseract with Optimized Configuration
-                # --oem 3: Default LSTM engine
-                # --psm 6: Assume a single uniform block of text (Critical for lists/questions)
-                custom_config = r'--oem 3 --psm 6'
-
-                hocr_data = pytesseract.image_to_data(
-                    processed_img,
-                    output_type=pytesseract.Output.DICT,
-                    config=custom_config
-                )
-
-                for i in range(len(hocr_data['level'])):
-                    text = hocr_data['text'][i].strip()
-                    if text and hocr_data['conf'][i] > -1:
-
-                        # 4. Coordinate Mapping
-                        # We scanned at Zoom 4.0, but our pipeline expects Zoom 2.0.
-                        # Scale Factor = (Target 2.0) / (Source 4.0) = 0.5
-                        scale_adjustment = scale_factor / ocr_zoom
-
-                        x1 = int(hocr_data['left'][i] * scale_adjustment)
-                        y1 = int(hocr_data['top'][i] * scale_adjustment)
-                        w = int(hocr_data['width'][i] * scale_adjustment)
-                        h = int(hocr_data['height'][i] * scale_adjustment)
-                        x2 = x1 + w
-                        y2 = y1 + h
-
-                        raw_ocr_output.append({
-                            'type': 'text',
-                            'word': text,
-                            'confidence': float(hocr_data['conf'][i]),
-                            'bbox': [x1, y1, x2, y2],
-                            'y0': y1,
-                            'x0': x1
-                        })
-            except Exception as e:
-                print(f" ❌ Tesseract OCR Error: {e}")
-            # === END OF OPTIMIZED OCR BLOCK ===
 
    # ====================================================================
-    # --- STEP 6: OCR CLEANING AND MERGING ---
+    # --- STEP 6: OCR CLEANING AND MERGING (Original Logic Unchanged) ---
    # ====================================================================
    items_to_sort = []
-
+
    for ocr_word in raw_ocr_output:
        is_suppressed = False
        for component in component_metadata:
@@ -1015,7 +1006,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
    items_to_sort.extend(component_metadata)
 
    # ====================================================================
-    # --- STEP 7: LINE-BASED SORTING ---
+    # --- STEP 7: LINE-BASED SORTING (Original Logic Unchanged) ---
    # ====================================================================
    items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
    lines = []
@@ -1051,6 +1042,18 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
    return final_output, page_separator_x
 
 
+
+
+
+
+
+
+
+
+
+
+
 def run_single_pdf_preprocessing(pdf_path: str, preprocessed_json_path: str) -> Optional[str]:
    global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
 
 
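Note on the cache contract this commit relies on: Step 2's call to get_word_data_for_detection now populates the OCR cache (running Tesseract only when native words are missing), and Step 5 strictly reads the cache back instead of re-running OCR. Below is a minimal, dict-backed sketch of the _ocr_cache interface as the diff uses it. Only has_ocr and get_ocr appear in the diff, and the word-tuple format follows the Step 5 comment; the put_ocr writer method is an assumption, since the diff never shows how the cache is filled.

from typing import Dict, List, Tuple

# (text, x1, y1, x2, y2) in PDF points, per the Step 5 comment
WordTuple = Tuple[str, float, float, float, float]

class SimpleOcrCache:
    """Hypothetical stand-in for _ocr_cache; the real storage may differ (e.g. on disk)."""

    def __init__(self) -> None:
        self._store: Dict[Tuple[str, int], List[WordTuple]] = {}

    def put_ocr(self, pdf_path: str, page_num: int, words: List[WordTuple]) -> None:
        # Assumed writer: in the pipeline, get_word_data_for_detection fills the cache.
        self._store[(pdf_path, page_num)] = words

    def has_ocr(self, pdf_path: str, page_num: int) -> bool:
        return (pdf_path, page_num) in self._store

    def get_ocr(self, pdf_path: str, page_num: int) -> List[WordTuple]:
        return self._store[(pdf_path, page_num)]

Because the cache stores PDF points, Step 5 must multiply by scale_factor = 2.0 to get back to pipeline pixels; that arithmetic is sketched next.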
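The diff's comments spell out two coordinate conversions: cached PDF-point boxes scale up by scale_factor = 2.0 into pipeline pixels, and (in the retired fallback) Tesseract boxes measured on the zoom-4.0 OCR render scale down by 2.0 / 4.0 = 0.5. A standalone sketch of that arithmetic follows; the function names here are illustrative, since the pipeline inlines these lines.

PIPELINE_SCALE = 2.0  # PDF points -> pipeline pixels (scale_factor in the diff)
OCR_ZOOM = 4.0        # high-res render used only for Tesseract (ocr_zoom in the diff)

def points_to_pipeline_px(x1, y1, x2, y2, scale=PIPELINE_SCALE):
    # Scale a PDF-point bbox (e.g. a cached word tuple) to pipeline pixels.
    return (int(x1 * scale), int(y1 * scale), int(x2 * scale), int(y2 * scale))

def ocr_px_to_pipeline_px(left, top, width, height, scale=PIPELINE_SCALE, zoom=OCR_ZOOM):
    # Map a Tesseract box measured at zoom 4.0 back to the zoom 2.0 pipeline.
    adj = scale / zoom  # (target 2.0) / (source 4.0) = 0.5
    x1, y1 = int(left * adj), int(top * adj)
    return (x1, y1, x1 + int(width * adj), y1 + int(height * adj))

# A word at PDF points (100, 200, 150, 210) lands at pixels (200, 400, 300, 420);
# a zoom-4 Tesseract box with left=800, top=100, width=40, height=20 maps to (400, 50, 420, 60).
assert points_to_pipeline_px(100, 200, 150, 210) == (200, 400, 300, 420)
assert ocr_px_to_pipeline_px(800, 100, 40, 20) == (400, 50, 420, 60)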
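The retired high-res fallback calls preprocess_image_for_ocr, which the diff only references ("Ensure 'preprocess_image_for_ocr' is defined at top of file!") and never shows. A plausible minimal body, assuming a plain grayscale-plus-Otsu binarization; the project's actual helper may do more (deskewing, denoising).

import cv2
import numpy as np

def preprocess_image_for_ocr(img_bgr: np.ndarray) -> np.ndarray:
    # Assumed implementation: the commit never defines this helper.
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    # Otsu picks the global threshold automatically; text becomes black on white.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary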