from collections import Counter
from statistics import mean, multimode


# ===============================
# MAIN FUNCTION
# ===============================
def mergeLinesToParagraphs(baseJson):
    """Merge a line-level document into a paragraph-level document.

    Args:
        baseJson: dict with a "general" dict and a "lines" list (the output
            of the earlier extraction step).

    Returns:
        A dict ``{"general": ..., "paragraphs": [...]}``. The "general" value
        is the SAME object as ``baseJson["general"]``; its ``commonFontSize``,
        ``commonFontSizes`` and ``commonMarkers`` entries are overwritten in
        place from the merged paragraphs.
    """
    general = baseJson["general"]
    lines = baseJson["lines"]

    paragraphs = []
    buffer = []  # consecutive lines that belong to the paragraph being built
    for i, curr in enumerate(lines):
        # A non-empty buffer always ends with lines[i - 1].
        if buffer and canMerge(lines[i - 1], curr, i - 1, i):
            buffer.append(curr)
            continue
        if buffer:
            paragraphs.append(buildParagraph(buffer, len(paragraphs) + 1, general))
        buffer = [curr]

    if buffer:
        paragraphs.append(buildParagraph(buffer, len(paragraphs) + 1, general))

    merged = {"general": general, "paragraphs": paragraphs}
    # Recompute the 'common*' statistics in general from the paragraphs.
    return recomputeCommonsInGeneralAfterMerge(merged)


# ===============================
# MERGE CONDITION FUNCTIONS
# ===============================
def canMerge(prev, curr, idx_prev=None, idx_curr=None):
    """Decide whether line ``curr`` continues the paragraph of line ``prev``.

    The checks run in order: marker, font size, style, spacing, alignment.
    Only the final fallback rejection is printed; the optional indices are
    used solely to label that message (shown 1-based).

    Returns:
        True when ``curr`` may be appended to the paragraph containing ``prev``.
    """
    # Build the label only when both indices are known; a single index used to
    # raise TypeError on ``None + 1``.
    if idx_prev is not None and idx_curr is not None:
        pair = f"[{idx_prev+1}->{idx_curr+1}]"
    else:
        pair = ""

    if isNewPara(curr):
        return False
    if not isSameFontSize(prev, curr):
        return False
    if not isSameStyle(prev, curr):
        return False
    if not isNear(prev, curr):
        return False

    if isSameAlign(prev, curr):
        return True
    if isBadAlign(prev, curr):
        return False
    if canMergeWithAlign(prev) or canMergeWithLeft(prev, curr):
        return True

    print(f"{pair} Merge=False | Reason: Fallback")
    return False


# Check MarkerText
def isNewPara(line):
    """Return True when the line carries a non-blank ``MarkerText``."""
    return line.get("MarkerText") not in (None, "", " ")


# Check FontSize
def isSameFontSize(prev, curr):
    """Return True when the two font sizes differ by at most 0.7."""
    return abs(prev["FontSize"] - curr["FontSize"]) <= 0.7


# Check Style
def _fontStyle(style):
    """Font-style part of a combined style code (the last three digits)."""
    return style % 1000


def _caseStyle(style):
    """Case-style part of a combined style code (the thousands digit)."""
    return style // 1000


def _lastWordStyle(line):
    """Style code of the last word of ``line``."""
    return line["Words"]["Last"]["Style"]


def _firstWordStyle(line):
    """Style code of the first word of ``line``."""
    return line["Words"]["First"]["Style"]


def isSameStyle(prev, curr):
    """True if any line/first-word/last-word pairing has an identical style."""
    return (
        isSameLineStyle(prev, curr)
        or isSameFirstStyle(prev, curr)
        or isSameLastStyle(prev, curr)
        or isSameWordStyle(prev, curr)
    )


def isSameFStyle(prev, curr):
    """True if any pairing shares the same font-style part (style % 1000)."""
    return (
        isSameLineFStyle(prev, curr)
        or isSameFirstFStyle(prev, curr)
        or isSameLastFStyle(prev, curr)
        or isSameWordFStyle(prev, curr)
    )


def isSameCase(prev, curr):
    """True if any pairing shares the same case-style part (style // 1000)."""
    return (
        isSameLineCase(prev, curr)
        or isSameFirstCase(prev, curr)
        or isSameLastCase(prev, curr)
        or isSameWordCase(prev, curr)
    )


# Line - Line
def isSameLineStyle(prev, curr):
    """Compare the full style codes of both lines."""
    return prev["Style"] == curr["Style"]


def isSameLineFStyle(prev, curr):
    """Compare the font-style parts of both lines."""
    return _fontStyle(prev["Style"]) == _fontStyle(curr["Style"])


def isSameLineCase(prev, curr):
    """Compare the case-style parts of both lines."""
    # Floor division isolates the thousands digit; true division compared the
    # whole code and made this identical to isSameLineStyle.
    return _caseStyle(prev["Style"]) == _caseStyle(curr["Style"])


# First - Line
def isSameFirstStyle(prev, curr):
    """Compare prev's line style with the style of curr's first word."""
    return prev["Style"] == _firstWordStyle(curr)


def isSameFirstFStyle(prev, curr):
    """Font-style variant of isSameFirstStyle."""
    return _fontStyle(prev["Style"]) == _fontStyle(_firstWordStyle(curr))


def isSameFirstCase(prev, curr):
    """Case-style variant of isSameFirstStyle."""
    return _caseStyle(prev["Style"]) == _caseStyle(_firstWordStyle(curr))


# Last - Line
def isSameLastStyle(prev, curr):
    """Compare the style of prev's last word with curr's line style."""
    return _lastWordStyle(prev) == curr["Style"]


def isSameLastFStyle(prev, curr):
    """Font-style variant of isSameLastStyle."""
    return _fontStyle(_lastWordStyle(prev)) == _fontStyle(curr["Style"])


def isSameLastCase(prev, curr):
    """Case-style variant of isSameLastStyle."""
    return _caseStyle(_lastWordStyle(prev)) == _caseStyle(curr["Style"])


# Last - First
def isSameWordStyle(prev, curr):
    """Compare the style of prev's last word with that of curr's first word."""
    return _lastWordStyle(prev) == _firstWordStyle(curr)


def isSameWordFStyle(prev, curr):
    """Font-style variant of isSameWordStyle."""
    return _fontStyle(_lastWordStyle(prev)) == _fontStyle(_firstWordStyle(curr))


def isSameWordCase(prev, curr):
    """Case-style variant of isSameWordStyle."""
    return _caseStyle(_lastWordStyle(prev)) == _caseStyle(_firstWordStyle(curr))


# Linespace
def isNear(prev, curr):
    """Return True when ``curr`` is vertically close enough to ``prev``.

    Returns False if either line lacks "Position" or ``curr`` lacks
    "LineHeight".

    NOTE(review): the units and exact meaning of Position["Top"] / ["Bot"] are
    not visible in this module -- presumably spacing above/below the line;
    confirm against the extractor before tuning the factors below.
    """
    if "Position" not in prev or "Position" not in curr:
        return False
    if "LineHeight" not in curr:
        return False

    line_height = curr["LineHeight"]
    prev_top = prev["Position"]["Top"]
    curr_top = curr["Position"]["Top"]
    curr_bot = curr["Position"]["Bot"]

    within_prev_top = curr_top < prev_top * 2
    within_own_bot = curr_top < curr_bot * 2 or curr_bot <= 3.0
    within_height = curr_top < line_height * 5
    return within_prev_top and within_own_bot and within_height


def _alignKey(line):
    """Return the line's "Align" value lower-cased (non-strings unchanged).

    The alignment checks below were written with mixed casing ("right" versus
    "Right"/"Justify"/...), so every comparison goes through this key.
    """
    value = line.get("Align")
    return value.lower() if isinstance(value, str) else value


def isSameAlign(prev, curr):
    """True when both lines have the same alignment (two missing values match)."""
    return _alignKey(prev) == _alignKey(curr)


def isBadAlign(prev, curr):
    """True when a right-aligned line follows a line that is not right-aligned."""
    return _alignKey(prev) != "right" and _alignKey(curr) == "right"


def isNoSameAlign0(prev):
    """True when ``prev`` is justified."""
    return _alignKey(prev) == "justify"


def isNoSameAlignC(prev):
    """True when ``prev`` is centered."""
    return _alignKey(prev) == "center"


def isNoSameAlignR(prev):
    """True when ``prev`` is right-aligned."""
    return _alignKey(prev) == "right"


def isNoSameAlignL(prev, curr):
    """True when ``prev`` is left-aligned and ``curr`` is justified."""
    return _alignKey(prev) == "left" and _alignKey(curr) == "justify"


def canMergeWithAlign(prev):
    """True when prev's alignment alone (justify/center/right) allows a merge."""
    return isNoSameAlign0(prev) or isNoSameAlignC(prev) or isNoSameAlignR(prev)


def canMergeWithLeft(prev, curr):
    """True for the left -> justify transition."""
    return isNoSameAlignL(prev, curr)


# ===============================
# PARAGRAPH BUILDER
# ===============================
def buildParagraph(lines, para_id, general=None):
    """Build one paragraph dict from a non-empty list of merged lines.

    Args:
        lines: the consecutive line dicts forming the paragraph.
        para_id: value stored under "Paragraph".
        general: optional document-level dict; its "commonFontSize" breaks
            ties between equally frequent font sizes.

    Returns:
        A dict with keys Paragraph, Text, MarkerText, MarkerType, Style,
        FontSize and Align.
    """
    first = lines[0]
    text = " ".join(ln["Text"] for ln in lines)

    # Style: digit-wise minimum over all lines.
    style = mergeStyle([ln["Style"] for ln in lines])

    fs_values = [ln["FontSize"] for ln in lines if ln.get("FontSize") is not None]
    if fs_values:
        modes = multimode(fs_values)  # every most-frequent value
        if len(modes) == 1:
            font_size = modes[0]
        elif general and general.get("commonFontSize") is not None:
            # Several modes: take the one closest to the document-wide size.
            target = general["commonFontSize"]
            font_size = min(modes, key=lambda x: abs(x - target))
        else:
            font_size = mean(fs_values)
        font_size = round(font_size, 1)
    else:
        font_size = 12.0  # default when no line reports a font size

    # Most frequent alignment; fall back to the last line's when that is falsy.
    align = mostCommon([ln.get("Align") for ln in lines]) or lines[-1].get("Align")

    return {
        "Paragraph": para_id,
        "Text": text,
        "MarkerText": first.get("MarkerText"),
        "MarkerType": first.get("MarkerType"),
        "Style": style,
        "FontSize": font_size,
        "Align": align,
    }


# ===============================
# HELPERS
# ===============================
def mergeStyle(styles):
    """Combine style codes by taking the minimum of each of the four digits.

    Args:
        styles: non-empty list of style codes (CaseStyle*1000 + FontStyle),
            each zero-padded to four digits before comparison.

    Returns:
        The int formed from the four per-position minimum digits.
    """
    padded = [str(s).zfill(4) for s in styles]
    lowest = [min(int(code[pos]) for code in padded) for pos in range(4)]
    return int("".join(str(d) for d in lowest))


def mostCommon(values):
    """Return the most frequent item of ``values``, or None when it is empty."""
    if not values:
        return None
    top = Counter(values).most_common(1)
    return top[0][0] if top else None


# ===============================
# RESOLVE COMMONS
# ===============================
def recomputeCommonsInGeneralAfterMerge(mergedJson):
    """Refresh the 'common*' entries of ``mergedJson["general"]`` in place.

    Updated fields:
        - commonFontSize: the most frequent paragraph font size (or None).
        - commonFontSizes: ``[{FontSize, Count}, ...]`` in descending Count.
        - commonMarkers: up to 10 most frequent MarkerType values whose count
          reaches max(1, 0.5% of the paragraph count).

    Returns:
        The same ``mergedJson`` object.
    """
    paragraphs = mergedJson.get("paragraphs", [])
    total = len(paragraphs)

    # --- Font sizes ---
    size_counts = Counter(
        p["FontSize"] for p in paragraphs if p.get("FontSize") is not None
    )
    commonFontSizes = [
        {"FontSize": round(size, 1), "Count": count}
        for size, count in size_counts.most_common()
    ]
    commonFontSize = commonFontSizes[0]["FontSize"] if commonFontSizes else None

    # --- Markers ---
    marker_counts = Counter(p["MarkerType"] for p in paragraphs if p.get("MarkerType"))
    threshold = max(1, int(total * 0.005))
    commonMarkers = [
        marker for marker, count in marker_counts.most_common(10) if count >= threshold
    ]

    # --- Write back into general ---
    mergedJson["general"].update({
        "commonFontSize": commonFontSize,
        "commonFontSizes": commonFontSizes,
        "commonMarkers": commonMarkers
    })
    return mergedJson