heerjtdev committed
Commit 96274f8 · verified · 1 Parent(s): 3541663

Update working_yolo_pipeline.py

Files changed (1):
  1. working_yolo_pipeline.py +298 -298

working_yolo_pipeline.py CHANGED
@@ -582,274 +582,15 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> list



-# def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
-#                             page_num: int, fitz_page: fitz.Page,
-#                             pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
-#     """
-#     OPTIMIZED FLOW:
-#     1. Run YOLO to find Equations/Tables.
-#     2. Mask raw text with YOLO boxes.
-#     3. Run Column Detection on the MASKED data.
-#     4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
-#     """
-#     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
-
-#     start_time_total = time.time()
-
-#     if original_img is None:
-#         print(f" ❌ Invalid image for page {page_num}.")
-#         return None, None
-
-#     # ====================================================================
-#     # --- STEP 1: YOLO DETECTION ---
-#     # ====================================================================
-#     start_time_yolo = time.time()
-#     results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
-
-#     relevant_detections = []
-#     if results and results[0].boxes:
-#         for box in results[0].boxes:
-#             class_id = int(box.cls[0])
-#             class_name = model.names[class_id]
-#             if class_name in TARGET_CLASSES:
-#                 x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
-#                 relevant_detections.append(
-#                     {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
-#                 )
-
-#     merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
-#     print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
-
-#     # ====================================================================
-#     # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
-#     # ====================================================================
-#     # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
-#     raw_words_for_layout = get_word_data_for_detection(
-#         fitz_page, pdf_path, page_num,
-#         top_margin_percent=0.10, bottom_margin_percent=0.10
-#     )
-
-#     masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
-
-#     # ====================================================================
-#     # --- STEP 3: COLUMN DETECTION ---
-#     # ====================================================================
-#     page_width_pdf = fitz_page.rect.width
-#     page_height_pdf = fitz_page.rect.height
-
-#     column_detection_params = {
-#         'cluster_bin_size': 2, 'cluster_smoothing': 2,
-#         'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
-#     }
-
-#     separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
-
-#     page_separator_x = None
-#     if separators:
-#         central_min = page_width_pdf * 0.35
-#         central_max = page_width_pdf * 0.65
-#         central_separators = [s for s in separators if central_min <= s <= central_max]
-
-#         if central_separators:
-#             center_x = page_width_pdf / 2
-#             page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
-#             print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
-#         else:
-#             print(" ⚠️ Gutter found off-center. Ignoring.")
-#     else:
-#         print(" -> Single Column Layout Confirmed.")
-
-#     # ====================================================================
-#     # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
-#     # ====================================================================
-#     start_time_components = time.time()
-#     component_metadata = []
-#     fig_count_page = 0
-#     eq_count_page = 0
-
-#     for detection in merged_detections:
-#         x1, y1, x2, y2 = detection['coords']
-#         class_name = detection['class']
-
-#         if class_name == 'figure':
-#             GLOBAL_FIGURE_COUNT += 1
-#             counter = GLOBAL_FIGURE_COUNT
-#             component_word = f"FIGURE{counter}"
-#             fig_count_page += 1
-#         elif class_name == 'equation':
-#             GLOBAL_EQUATION_COUNT += 1
-#             counter = GLOBAL_EQUATION_COUNT
-#             component_word = f"EQUATION{counter}"
-#             eq_count_page += 1
-#         else:
-#             continue
-
-#         component_crop = original_img[y1:y2, x1:x2]
-#         component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
-#         cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
-
-#         y_midpoint = (y1 + y2) // 2
-#         component_metadata.append({
-#             'type': class_name, 'word': component_word,
-#             'bbox': [int(x1), int(y1), int(x2), int(y2)],
-#             'y0': int(y_midpoint), 'x0': int(x1)
-#         })
-
-#     # ====================================================================
-#     # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
-#     # ====================================================================
-#     raw_ocr_output = []
-#     scale_factor = 2.0  # Pipeline standard scale
-
-#     try:
-#         # Try getting native text first
-#         raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
-#     except Exception as e:
-#         print(f" ❌ Native text extraction failed: {e}")
-
-#     # If native text is missing, fall back to OCR
-#     if not raw_ocr_output:
-#         if _ocr_cache.has_ocr(pdf_path, page_num):
-#             print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
-#             cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
-#             for word_tuple in cached_word_data:
-#                 word_text, x1, y1, x2, y2 = word_tuple
-
-#                 # Scale from PDF points to Pipeline Pixels (2.0)
-#                 x1_pix = int(x1 * scale_factor)
-#                 y1_pix = int(y1 * scale_factor)
-#                 x2_pix = int(x2 * scale_factor)
-#                 y2_pix = int(y2 * scale_factor)
-
-#                 raw_ocr_output.append({
-#                     'type': 'text', 'word': word_text, 'confidence': 95.0,
-#                     'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
-#                     'y0': y1_pix, 'x0': x1_pix
-#                 })
-#         else:
-#             # === START OF OPTIMIZED OCR BLOCK ===
-#             try:
-#                 # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
-#                 # We do this specifically for OCR accuracy, separate from the pipeline image
-#                 ocr_zoom = 4.0
-#                 pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
-
-#                 # Convert PyMuPDF Pixmap to OpenCV format
-#                 img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width, pix_ocr.n)
-#                 if pix_ocr.n == 3: img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
-#                 elif pix_ocr.n == 4: img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
-
-#                 # 2. Preprocess (Binarization)
-#                 # Ensure 'preprocess_image_for_ocr' is defined at top of file!
-#                 processed_img = preprocess_image_for_ocr(img_ocr_np)
-
-#                 # 3. Run Tesseract with Optimized Configuration
-#                 # --oem 3: Default LSTM engine
-#                 # --psm 6: Assume a single uniform block of text (Critical for lists/questions)
-#                 custom_config = r'--oem 3 --psm 6'
-
-#                 hocr_data = pytesseract.image_to_data(
-#                     processed_img,
-#                     output_type=pytesseract.Output.DICT,
-#                     config=custom_config
-#                 )
-
-#                 for i in range(len(hocr_data['level'])):
-#                     text = hocr_data['text'][i].strip()
-#                     if text and hocr_data['conf'][i] > -1:
-
-#                         # 4. Coordinate Mapping
-#                         # We scanned at Zoom 4.0, but our pipeline expects Zoom 2.0.
-#                         # Scale Factor = (Target 2.0) / (Source 4.0) = 0.5
-#                         scale_adjustment = scale_factor / ocr_zoom
-
-#                         x1 = int(hocr_data['left'][i] * scale_adjustment)
-#                         y1 = int(hocr_data['top'][i] * scale_adjustment)
-#                         w = int(hocr_data['width'][i] * scale_adjustment)
-#                         h = int(hocr_data['height'][i] * scale_adjustment)
-#                         x2 = x1 + w
-#                         y2 = y1 + h
-
-#                         raw_ocr_output.append({
-#                             'type': 'text',
-#                             'word': text,
-#                             'confidence': float(hocr_data['conf'][i]),
-#                             'bbox': [x1, y1, x2, y2],
-#                             'y0': y1,
-#                             'x0': x1
-#                         })
-#             except Exception as e:
-#                 print(f" ❌ Tesseract OCR Error: {e}")
-#             # === END OF OPTIMIZED OCR BLOCK ===
-
-#     # ====================================================================
-#     # --- STEP 6: OCR CLEANING AND MERGING ---
-#     # ====================================================================
-#     items_to_sort = []
-
-#     for ocr_word in raw_ocr_output:
-#         is_suppressed = False
-#         for component in component_metadata:
-#             # Do not include words that are inside figure/equation boxes
-#             ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
-#             if ioa > IOA_SUPPRESSION_THRESHOLD:
-#                 is_suppressed = True
-#                 break
-#         if not is_suppressed:
-#             items_to_sort.append(ocr_word)
-
-#     # Add figures/equations back into the flow as "words"
-#     items_to_sort.extend(component_metadata)
-
-#     # ====================================================================
-#     # --- STEP 7: LINE-BASED SORTING ---
-#     # ====================================================================
-#     items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
-#     lines = []
-
-#     for item in items_to_sort:
-#         placed = False
-#         for line in lines:
-#             y_ref = min(it['y0'] for it in line)
-#             if abs(y_ref - item['y0']) < LINE_TOLERANCE:
-#                 line.append(item)
-#                 placed = True
-#                 break
-#         if not placed and item['type'] in ['equation', 'figure']:
-#             for line in lines:
-#                 y_ref = min(it['y0'] for it in line)
-#                 if abs(y_ref - item['y0']) < 20:
-#                     line.append(item)
-#                     placed = True
-#                     break
-#         if not placed:
-#             lines.append([item])
-
-#     for line in lines:
-#         line.sort(key=lambda x: x['x0'])
-
-#     final_output = []
-#     for line in lines:
-#         for item in line:
-#             data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
-#             if 'tag' in item: data_item['tag'] = item['tag']
-#             final_output.append(data_item)
-
-#     return final_output, page_separator_x
-
-
-
-
-
 def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
                             page_num: int, fitz_page: fitz.Page,
                             pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
     """
-    OPTIMIZED FLOW:
+    OPTIMIZED FLOW:
     1. Run YOLO to find Equations/Tables.
     2. Mask raw text with YOLO boxes.
-    3. Run Column Detection on the MASKED data (Populates OCR cache).
-    4. Proceed with Final OCR Output (Strictly using the cache).
+    3. Run Column Detection on the MASKED data.
+    4. Proceed with OCR (Native or High-Res Tesseract Fallback) and Output.
     """
     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT

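A note on the shape of this change: the commit swaps which of the two Step 5 strategies is live. The hybrid variant deleted from the top of the file here (native text first, Tesseract only as a fallback) becomes the active function, and the previously active cache-only variant is re-added in commented form at the bottom of the file. A minimal sketch of the native-first decision, with the two stages passed in as callables so the sketch stays self-contained (the real code calls extract_native_words_and_convert and the Step 5 fallback block directly):

def get_words(fitz_page, native_extractor, ocr_fallback, scale_factor=2.0):
    # Native text layer first: exact and essentially free when the PDF has one.
    try:
        words = native_extractor(fitz_page, scale_factor=scale_factor)
    except Exception:
        words = []
    # Only scanned/textless pages pay the OCR cost.
    if not words:
        words = ocr_fallback(fitz_page)
    return words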
@@ -864,7 +605,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     # ====================================================================
     start_time_yolo = time.time()
     results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
-
+
     relevant_detections = []
     if results and results[0].boxes:
         for box in results[0].boxes:
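merge_overlapping_boxes and IOU_MERGE_THRESHOLD are used just after this hunk but defined elsewhere in the file, so the following is only a sketch of what IoU-based merging of YOLO detections typically looks like: greedily union any box whose overlap with an already-kept box exceeds the threshold.

# Assumed behavior of merge_overlapping_boxes; the real body is not in this diff.
def iou(a, b):
    ax1, ay1, ax2, ay2 = a
    bx1, by1, bx2, by2 = b
    iw = max(0, min(ax2, bx2) - max(ax1, bx1))
    ih = max(0, min(ay2, by2) - max(ay1, by1))
    inter = iw * ih
    union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
    return inter / union if union else 0.0

def merge_boxes(detections, threshold):
    merged = []
    for det in sorted(detections, key=lambda d: -d['conf']):
        for kept in merged:
            if iou(det['coords'], kept['coords']) > threshold:
                # Union the two boxes; the higher-confidence one wins the metadata.
                x1, y1, x2, y2 = kept['coords']
                dx1, dy1, dx2, dy2 = det['coords']
                kept['coords'] = (min(x1, dx1), min(y1, dy1), max(x2, dx2), max(y2, dy2))
                break
        else:
            merged.append(dict(det))
    return merged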
@@ -880,10 +621,9 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")

     # ====================================================================
-    # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING & CACHING) ---
-    # This call to get_word_data_for_detection will execute Tesseract if
-    # native words are missing, and save the result to the cache.
+    # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING) ---
     # ====================================================================
+    # Note: This uses the updated 'get_word_data_for_detection' which has its own optimizations
     raw_words_for_layout = get_word_data_for_detection(
         fitz_page, pdf_path, page_num,
         top_margin_percent=0.10, bottom_margin_percent=0.10
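merge_yolo_into_word_data, called right after this hunk, is also outside the diff. A hypothetical sketch of the masking idea: drop words whose centre falls inside a detection and insert one synthetic block per box, so the gutter detector sees a figure as solid ink rather than as whitespace. The division by scale_factor assumes detections are in zoom-2.0 pixels while words are in PDF points, matching the conventions used elsewhere in this file.

# Hypothetical stand-in for merge_yolo_into_word_data.
def mask_words(word_data, detections, scale_factor=2.0):
    masked = []
    # Convert detection boxes from pipeline pixels to PDF points.
    boxes_pdf = [tuple(c / scale_factor for c in d['coords']) for d in detections]
    for text, x1, y1, x2, y2 in word_data:
        cx, cy = (x1 + x2) / 2, (y1 + y2) / 2
        inside = any(bx1 <= cx <= bx2 and by1 <= cy <= by2
                     for bx1, by1, bx2, by2 in boxes_pdf)
        if not inside:
            masked.append((text, x1, y1, x2, y2))
    # One solid block per detection replaces the words it swallowed.
    masked.extend(("[BLOCK]", *b) for b in boxes_pdf)
    return masked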
@@ -895,21 +635,21 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     # --- STEP 3: COLUMN DETECTION ---
     # ====================================================================
     page_width_pdf = fitz_page.rect.width
-    page_height_pdf = fitz_page.rect.height
-
+    page_height_pdf = fitz_page.rect.height
+
     column_detection_params = {
         'cluster_bin_size': 2, 'cluster_smoothing': 2,
         'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
     }

     separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
-
+
     page_separator_x = None
     if separators:
         central_min = page_width_pdf * 0.35
         central_max = page_width_pdf * 0.65
         central_separators = [s for s in separators if central_min <= s <= central_max]
-
+
         if central_separators:
             center_x = page_width_pdf / 2
             page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
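calculate_x_gutters is likewise not part of this diff. Below is a rough sketch of histogram-based gutter detection consistent with the cluster_* parameters above: bin word spans along x, smooth, and report wide low-coverage valleys as candidate separators. The real function takes the params dict and the page height; this sketch flattens that into keyword arguments for brevity.

import numpy as np

def x_gutters(word_data, page_width, bin_size=2, smoothing=2, min_width=10, pct=85):
    # word_data: (text, x1, y1, x2, y2) tuples in PDF points.
    bins = int(page_width // bin_size) + 1
    hist = np.zeros(bins)
    for _, x1, _, x2, _ in word_data:
        hist[int(x1 // bin_size):int(x2 // bin_size) + 1] += 1
    if smoothing:
        kernel = np.ones(2 * smoothing + 1) / (2 * smoothing + 1)
        hist = np.convolve(hist, kernel, mode='same')
    low = hist < np.percentile(hist, 100 - pct)   # sparse bins = candidate gutters
    seps, start = [], None
    for i, flag in enumerate(np.append(low, False)):
        if flag and start is None:
            start = i
        elif not flag and start is not None:
            if (i - start) * bin_size >= min_width:
                seps.append((start + i) / 2 * bin_size)  # centre of the valley
            start = None
    return seps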
@@ -956,41 +696,97 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
         })

     # ====================================================================
-    # --- STEP 5: CACHED OCR RETRIEVAL (No Redundant Tesseract) ---
+    # --- STEP 5: HYBRID OCR (Native Text + Cached Tesseract Fallback) ---
     # ====================================================================
     raw_ocr_output = []
-    scale_factor = 2.0  # Pipeline standard scale
+    scale_factor = 2.0  # Pipeline standard scale

-    if _ocr_cache.has_ocr(pdf_path, page_num):
-        print(f" ⚡ Using cached OCR (Native or Tesseract) for page {page_num}")
-        cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
-
-        for word_tuple in cached_word_data:
-            # Cache stores: (text, x1, y1, x2, y2) in PDF points (see get_word_data_for_detection)
-            word_text, x1, y1, x2, y2 = word_tuple
-
-            # Scale from PDF points back to Pipeline Pixels (2.0)
-            x1_pix = int(x1 * scale_factor)
-            y1_pix = int(y1 * scale_factor)
-            x2_pix = int(x2 * scale_factor)
-            y2_pix = int(y2 * scale_factor)
-
-            raw_ocr_output.append({
-                'type': 'text', 'word': word_text, 'confidence': 95.0,  # 95.0 is a default/placeholder confidence
-                'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
-                'y0': y1_pix, 'x0': x1_pix
-            })
-    else:
-        # This branch is hit only if the cache check in Step 2 failed to produce text,
-        # meaning the page is genuinely textless or entirely composed of images/figures.
-        print(f" ⚠️ No text found in cache for page {page_num}. Proceeding without words.")
-
+    try:
+        # Try getting native text first
+        raw_ocr_output = extract_native_words_and_convert(fitz_page, scale_factor=scale_factor)
+    except Exception as e:
+        print(f" ❌ Native text extraction failed: {e}")
+
+    # If native text is missing, fall back to OCR
+    if not raw_ocr_output:
+        if _ocr_cache.has_ocr(pdf_path, page_num):
+            print(f" ⚡ Using cached Tesseract OCR for page {page_num}")
+            cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
+            for word_tuple in cached_word_data:
+                word_text, x1, y1, x2, y2 = word_tuple
+
+                # Scale from PDF points to Pipeline Pixels (2.0)
+                x1_pix = int(x1 * scale_factor)
+                y1_pix = int(y1 * scale_factor)
+                x2_pix = int(x2 * scale_factor)
+                y2_pix = int(y2 * scale_factor)
+
+                raw_ocr_output.append({
+                    'type': 'text', 'word': word_text, 'confidence': 95.0,
+                    'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
+                    'y0': y1_pix, 'x0': x1_pix
+                })
+        else:
+            # === START OF OPTIMIZED OCR BLOCK ===
+            try:
+                # 1. Re-render Page at High Resolution (Zoom 4.0 = ~300 DPI)
+                # We do this specifically for OCR accuracy, separate from the pipeline image
+                ocr_zoom = 4.0
+                pix_ocr = fitz_page.get_pixmap(matrix=fitz.Matrix(ocr_zoom, ocr_zoom))
+
+                # Convert PyMuPDF Pixmap to OpenCV format
+                img_ocr_np = np.frombuffer(pix_ocr.samples, dtype=np.uint8).reshape(pix_ocr.height, pix_ocr.width, pix_ocr.n)
+                if pix_ocr.n == 3: img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGB2BGR)
+                elif pix_ocr.n == 4: img_ocr_np = cv2.cvtColor(img_ocr_np, cv2.COLOR_RGBA2BGR)
+
+                # 2. Preprocess (Binarization)
+                # Ensure 'preprocess_image_for_ocr' is defined at top of file!
+                processed_img = preprocess_image_for_ocr(img_ocr_np)
+
+                # 3. Run Tesseract with Optimized Configuration
+                # --oem 3: Default LSTM engine
+                # --psm 6: Assume a single uniform block of text (Critical for lists/questions)
+                custom_config = r'--oem 3 --psm 6'
+
+                hocr_data = pytesseract.image_to_data(
+                    processed_img,
+                    output_type=pytesseract.Output.DICT,
+                    config=custom_config
+                )
+
+                for i in range(len(hocr_data['level'])):
+                    text = hocr_data['text'][i].strip()
+                    if text and hocr_data['conf'][i] > -1:
+
+                        # 4. Coordinate Mapping
+                        # We scanned at Zoom 4.0, but our pipeline expects Zoom 2.0.
+                        # Scale Factor = (Target 2.0) / (Source 4.0) = 0.5
+                        scale_adjustment = scale_factor / ocr_zoom
+
+                        x1 = int(hocr_data['left'][i] * scale_adjustment)
+                        y1 = int(hocr_data['top'][i] * scale_adjustment)
+                        w = int(hocr_data['width'][i] * scale_adjustment)
+                        h = int(hocr_data['height'][i] * scale_adjustment)
+                        x2 = x1 + w
+                        y2 = y1 + h
+
+                        raw_ocr_output.append({
+                            'type': 'text',
+                            'word': text,
+                            'confidence': float(hocr_data['conf'][i]),
+                            'bbox': [x1, y1, x2, y2],
+                            'y0': y1,
+                            'x0': x1
+                        })
+            except Exception as e:
+                print(f" ❌ Tesseract OCR Error: {e}")
+            # === END OF OPTIMIZED OCR BLOCK ===

     # ====================================================================
-    # --- STEP 6: OCR CLEANING AND MERGING (Original Logic Unchanged) ---
+    # --- STEP 6: OCR CLEANING AND MERGING ---
     # ====================================================================
     items_to_sort = []
-
+
     for ocr_word in raw_ocr_output:
         is_suppressed = False
         for component in component_metadata:
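The coordinate mapping in the new Step 5 is easy to sanity-check with numbers: Tesseract boxes arrive in zoom-4.0 pixels and are mapped into the pipeline's zoom-2.0 space, so scale_adjustment = 2.0 / 4.0 = 0.5 and every coordinate is halved. A self-contained check with a made-up Tesseract box:

# Worked example of the Step 5 coordinate mapping (sample numbers only).
scale_factor, ocr_zoom = 2.0, 4.0
scale_adjustment = scale_factor / ocr_zoom          # 0.5
left, top, width, height = 840, 412, 96, 30         # hypothetical box in zoom-4.0 pixels
x1, y1 = int(left * scale_adjustment), int(top * scale_adjustment)
x2 = x1 + int(width * scale_adjustment)
y2 = y1 + int(height * scale_adjustment)
assert (x1, y1, x2, y2) == (420, 206, 468, 221)     # zoom-2.0 pipeline pixels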
@@ -1006,7 +802,7 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
     items_to_sort.extend(component_metadata)

     # ====================================================================
-    # --- STEP 7: LINE-BASED SORTING (Original Logic Unchanged) ---
+    # --- STEP 7: LINE-BASED SORTING ---
     # ====================================================================
     items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
     lines = []
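Step 7's grouping, unchanged by this commit and visible in full inside the commented block added below, can be exercised on toy data: an item joins an existing line when its y0 is within LINE_TOLERANCE of that line's topmost item, and each line is then sorted left-to-right.

LINE_TOLERANCE = 10  # stand-in; the real constant is defined elsewhere in the file

items = [
    {'word': 'World', 'y0': 102, 'x0': 80},
    {'word': 'Hello', 'y0': 100, 'x0': 10},
    {'word': 'Next',  'y0': 140, 'x0': 10},
]
items.sort(key=lambda it: (it['y0'], it['x0']))

lines = []
for item in items:
    for line in lines:
        if abs(min(it['y0'] for it in line) - item['y0']) < LINE_TOLERANCE:
            line.append(item)
            break
    else:
        lines.append([item])

for line in lines:
    line.sort(key=lambda it: it['x0'])

print([[it['word'] for it in line] for line in lines])  # [['Hello', 'World'], ['Next']]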
@@ -1045,6 +841,210 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,



+# def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
+#                             page_num: int, fitz_page: fitz.Page,
+#                             pdf_name: str) -> Tuple[List[Dict[str, Any]], Optional[int]]:
+#     """
+#     OPTIMIZED FLOW:
+#     1. Run YOLO to find Equations/Tables.
+#     2. Mask raw text with YOLO boxes.
+#     3. Run Column Detection on the MASKED data (Populates OCR cache).
+#     4. Proceed with Final OCR Output (Strictly using the cache).
+#     """
+#     global GLOBAL_FIGURE_COUNT, GLOBAL_EQUATION_COUNT
+
+#     start_time_total = time.time()
+
+#     if original_img is None:
+#         print(f" ❌ Invalid image for page {page_num}.")
+#         return None, None
+
+#     # ====================================================================
+#     # --- STEP 1: YOLO DETECTION ---
+#     # ====================================================================
+#     start_time_yolo = time.time()
+#     results = model.predict(source=original_img, conf=CONF_THRESHOLD, imgsz=640, verbose=False)
+
+#     relevant_detections = []
+#     if results and results[0].boxes:
+#         for box in results[0].boxes:
+#             class_id = int(box.cls[0])
+#             class_name = model.names[class_id]
+#             if class_name in TARGET_CLASSES:
+#                 x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
+#                 relevant_detections.append(
+#                     {'coords': (x1, y1, x2, y2), 'y1': y1, 'class': class_name, 'conf': float(box.conf[0])}
+#                 )
+
+#     merged_detections = merge_overlapping_boxes(relevant_detections, IOU_MERGE_THRESHOLD)
+#     print(f" [LOG] YOLO found {len(merged_detections)} objects in {time.time() - start_time_yolo:.3f}s.")
+
+#     # ====================================================================
+#     # --- STEP 2: PREPARE DATA FOR COLUMN DETECTION (MASKING & CACHING) ---
+#     # This call to get_word_data_for_detection will execute Tesseract if
+#     # native words are missing, and save the result to the cache.
+#     # ====================================================================
+#     raw_words_for_layout = get_word_data_for_detection(
+#         fitz_page, pdf_path, page_num,
+#         top_margin_percent=0.10, bottom_margin_percent=0.10
+#     )
+
+#     masked_word_data = merge_yolo_into_word_data(raw_words_for_layout, merged_detections, scale_factor=2.0)
+
+#     # ====================================================================
+#     # --- STEP 3: COLUMN DETECTION ---
+#     # ====================================================================
+#     page_width_pdf = fitz_page.rect.width
+#     page_height_pdf = fitz_page.rect.height
+
+#     column_detection_params = {
+#         'cluster_bin_size': 2, 'cluster_smoothing': 2,
+#         'cluster_min_width': 10, 'cluster_threshold_percentile': 85,
+#     }
+
+#     separators = calculate_x_gutters(masked_word_data, column_detection_params, page_height_pdf)
+
+#     page_separator_x = None
+#     if separators:
+#         central_min = page_width_pdf * 0.35
+#         central_max = page_width_pdf * 0.65
+#         central_separators = [s for s in separators if central_min <= s <= central_max]
+
+#         if central_separators:
+#             center_x = page_width_pdf / 2
+#             page_separator_x = min(central_separators, key=lambda x: abs(x - center_x))
+#             print(f" ✅ Column Split Confirmed at X={page_separator_x:.1f}")
+#         else:
+#             print(" ⚠️ Gutter found off-center. Ignoring.")
+#     else:
+#         print(" -> Single Column Layout Confirmed.")
+
+#     # ====================================================================
+#     # --- STEP 4: COMPONENT EXTRACTION (Save Images) ---
+#     # ====================================================================
+#     start_time_components = time.time()
+#     component_metadata = []
+#     fig_count_page = 0
+#     eq_count_page = 0
+
+#     for detection in merged_detections:
+#         x1, y1, x2, y2 = detection['coords']
+#         class_name = detection['class']
+
+#         if class_name == 'figure':
+#             GLOBAL_FIGURE_COUNT += 1
+#             counter = GLOBAL_FIGURE_COUNT
+#             component_word = f"FIGURE{counter}"
+#             fig_count_page += 1
+#         elif class_name == 'equation':
+#             GLOBAL_EQUATION_COUNT += 1
+#             counter = GLOBAL_EQUATION_COUNT
+#             component_word = f"EQUATION{counter}"
+#             eq_count_page += 1
+#         else:
+#             continue
+
+#         component_crop = original_img[y1:y2, x1:x2]
+#         component_filename = f"{pdf_name}_page{page_num}_{class_name}{counter}.png"
+#         cv2.imwrite(os.path.join(FIGURE_EXTRACTION_DIR, component_filename), component_crop)
+
+#         y_midpoint = (y1 + y2) // 2
+#         component_metadata.append({
+#             'type': class_name, 'word': component_word,
+#             'bbox': [int(x1), int(y1), int(x2), int(y2)],
+#             'y0': int(y_midpoint), 'x0': int(x1)
+#         })
+
+#     # ====================================================================
+#     # --- STEP 5: CACHED OCR RETRIEVAL (No Redundant Tesseract) ---
+#     # ====================================================================
+#     raw_ocr_output = []
+#     scale_factor = 2.0  # Pipeline standard scale
+
+#     if _ocr_cache.has_ocr(pdf_path, page_num):
+#         print(f" ⚡ Using cached OCR (Native or Tesseract) for page {page_num}")
+#         cached_word_data = _ocr_cache.get_ocr(pdf_path, page_num)
+
+#         for word_tuple in cached_word_data:
+#             # Cache stores: (text, x1, y1, x2, y2) in PDF points (see get_word_data_for_detection)
+#             word_text, x1, y1, x2, y2 = word_tuple
+
+#             # Scale from PDF points back to Pipeline Pixels (2.0)
+#             x1_pix = int(x1 * scale_factor)
+#             y1_pix = int(y1 * scale_factor)
+#             x2_pix = int(x2 * scale_factor)
+#             y2_pix = int(y2 * scale_factor)
+
+#             raw_ocr_output.append({
+#                 'type': 'text', 'word': word_text, 'confidence': 95.0,  # 95.0 is a default/placeholder confidence
+#                 'bbox': [x1_pix, y1_pix, x2_pix, y2_pix],
+#                 'y0': y1_pix, 'x0': x1_pix
+#             })
+#     else:
+#         # This branch is hit only if the cache check in Step 2 failed to produce text,
+#         # meaning the page is genuinely textless or entirely composed of images/figures.
+#         print(f" ⚠️ No text found in cache for page {page_num}. Proceeding without words.")
+
+#     # ====================================================================
+#     # --- STEP 6: OCR CLEANING AND MERGING (Original Logic Unchanged) ---
+#     # ====================================================================
+#     items_to_sort = []
+
+#     for ocr_word in raw_ocr_output:
+#         is_suppressed = False
+#         for component in component_metadata:
+#             # Do not include words that are inside figure/equation boxes
+#             ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
+#             if ioa > IOA_SUPPRESSION_THRESHOLD:
+#                 is_suppressed = True
+#                 break
+#         if not is_suppressed:
+#             items_to_sort.append(ocr_word)
+
+#     # Add figures/equations back into the flow as "words"
+#     items_to_sort.extend(component_metadata)
+
+#     # ====================================================================
+#     # --- STEP 7: LINE-BASED SORTING (Original Logic Unchanged) ---
+#     # ====================================================================
+#     items_to_sort.sort(key=lambda x: (x['y0'], x['x0']))
+#     lines = []
+
+#     for item in items_to_sort:
+#         placed = False
+#         for line in lines:
+#             y_ref = min(it['y0'] for it in line)
+#             if abs(y_ref - item['y0']) < LINE_TOLERANCE:
+#                 line.append(item)
+#                 placed = True
+#                 break
+#         if not placed and item['type'] in ['equation', 'figure']:
+#             for line in lines:
+#                 y_ref = min(it['y0'] for it in line)
+#                 if abs(y_ref - item['y0']) < 20:
+#                     line.append(item)
+#                     placed = True
+#                     break
+#         if not placed:
+#             lines.append([item])
+
+#     for line in lines:
+#         line.sort(key=lambda x: x['x0'])
+
+#     final_output = []
+#     for line in lines:
+#         for item in line:
+#             data_item = {"word": item["word"], "bbox": item["bbox"], "type": item["type"]}
+#             if 'tag' in item: data_item['tag'] = item['tag']
+#             final_output.append(data_item)
+
+#     return final_output, page_separator_x
+
+
+
+

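Step 6 suppresses words by comparing calculate_ioa against IOA_SUPPRESSION_THRESHOLD; neither is defined in this diff, so the body below is an assumption about the usual reading of IoA, intersection normalised by the word box's own area, which makes a word fully inside a figure score 1.0 regardless of the figure's size (unlike IoU).

# Assumed behavior of calculate_ioa; the real body is not in this diff.
def calculate_ioa(word_bbox, comp_bbox):
    wx1, wy1, wx2, wy2 = word_bbox
    cx1, cy1, cx2, cy2 = comp_bbox
    iw = max(0, min(wx2, cx2) - max(wx1, cx1))
    ih = max(0, min(wy2, cy2) - max(wy1, cy1))
    word_area = max(1, (wx2 - wx1) * (wy2 - wy1))
    return (iw * ih) / word_area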
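Finally, _ocr_cache is called throughout (has_ocr / get_ocr) but never defined in this diff. A hypothetical stand-in consistent with those call sites: keyed by (pdf_path, page_num) and storing (text, x1, y1, x2, y2) tuples in PDF points, so Tesseract runs at most once per page across column detection and final OCR.

# Hypothetical stand-in for _ocr_cache; set_ocr is an assumed writer method.
class OcrCache:
    def __init__(self):
        self._store = {}

    def has_ocr(self, pdf_path, page_num):
        return (pdf_path, page_num) in self._store

    def get_ocr(self, pdf_path, page_num):
        return self._store[(pdf_path, page_num)]

    def set_ocr(self, pdf_path, page_num, words):
        # words: iterable of (text, x1, y1, x2, y2) in PDF points
        self._store[(pdf_path, page_num)] = list(words)

_ocr_cache = OcrCache()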