doc-ai-api / Libraries /Json_ChunkMaster.py
LongK171's picture
Add all
dbe2c62
from collections import OrderedDict
from copy import deepcopy
class ChunkBuilder:
    """Group a flat list of paragraphs into hierarchical, indexed chunks.

    The structure spec (first element of ``RawLvlsDict``) is a mapping whose
    keys are field names in hierarchy order. Every key except the last is a
    "level" field that holds the text of one heading; the last key is the
    content field, which collects the texts of all paragraphs whose marker
    matches no level. A level field's value is the list of marker types that
    identify a paragraph as a heading of that level.
    """

    def readInput(self, RawLvlsDict=None, RawDataDict=None):
        """Load the structure spec and paragraphs and reset the builder state.

        Args:
            RawLvlsDict: Sequence whose first element is the structure spec
                mapping. Required in practice despite the ``None`` default:
                it is indexed unconditionally.
            RawDataDict: Mapping with a ``"paragraphs"`` list of paragraph
                dicts. ``None`` (or a missing/``None`` ``"paragraphs"``
                entry) is treated as "no paragraphs".
        """
        # Read the input data.
        self.struct_spec = RawLvlsDict[0]

        def paragraph_order(paragraph):
            # An explicit None is handled like a missing key, mirroring how
            # "Text" and "MarkerType" are defaulted in build(); otherwise
            # sorting None against ints raises TypeError.
            position = paragraph.get("Paragraph", 0)
            return 0 if position is None else position

        raw_paragraphs = (RawDataDict or {}).get("paragraphs") or []
        self.paragraphs = sorted(raw_paragraphs, key=paragraph_order)

        # Prepare the hierarchy: all keys but the last are heading levels.
        self.ordered_fields = list(self.struct_spec.keys())
        self.last_field = self.ordered_fields[-1]
        self.level_fields = self.ordered_fields[:-1]

        # Marker set per field; only list values are treated as markers.
        self.marker_dict = {}
        for fld in self.ordered_fields:
            vals = self.struct_spec.get(fld, [])
            self.marker_dict[fld] = set(vals) if isinstance(vals, list) else set()

        # Working state, reset on every call.
        self.StructDict = []
        self.index_counter = 1

    # ===== Utility helpers =====

    def _new_temp(self):
        """Return an empty working chunk: blank headings and no content."""
        temp = {fld: "" for fld in self.level_fields}
        temp[self.last_field] = []
        return temp

    def _temp_has_data(self, temp):
        """Return True if any heading is non-blank or any content exists."""
        return any(temp[f].strip() for f in self.level_fields) or bool(temp[self.last_field])

    def _reset_deeper(self, temp, touched_field):
        """Clear every level below *touched_field* and the collected content.

        Raises:
            ValueError: If *touched_field* is not one of the level fields.
        """
        idx = self.level_fields.index(touched_field)
        for f in self.level_fields[idx + 1:]:
            temp[f] = ""
        temp[self.last_field] = []

    def _has_data_from_level(self, temp, fld):
        """Check whether level *fld*, any deeper level, or the content has data.

        Returns False when *fld* is not a level field.
        """
        if fld not in self.level_fields:
            return False
        idx = self.level_fields.index(fld)
        if any(temp[f].strip() for f in self.level_fields[idx:]):
            return True
        return bool(temp[self.last_field])

    def _with_index(self, temp, idx):
        """Build an OrderedDict with "Index" first, then levels, then content."""
        od = OrderedDict()
        od["Index"] = idx
        for f in self.level_fields:
            od[f] = temp[f]
        od[self.last_field] = temp[self.last_field]
        return od

    def _match_level(self, marker):
        """Return the first level field whose marker set contains *marker*, else None."""
        for fld in self.level_fields:
            if marker in self.marker_dict.get(fld, set()):
                return fld
        return None

    def _emit(self, temp):
        """Append a snapshot of *temp* to the result and advance the index."""
        # deepcopy so later mutation of the working chunk cannot leak into
        # chunks that were already emitted.
        self.StructDict.append(self._with_index(deepcopy(temp), self.index_counter))
        self.index_counter += 1

    # ===== Main entry point =====

    def build(self, RawLvlsDict=None, RawDataDict=None):
        """Build the list of chunks from the structure spec and paragraphs.

        Paragraphs are processed in ascending ``"Paragraph"`` order. A
        paragraph whose ``"MarkerType"`` (``"none"`` when missing or falsy)
        belongs to a level starts a new heading at that level: the working
        chunk is emitted first if it already holds data at that level or
        deeper, then the heading is stored and everything deeper is cleared.
        Any other paragraph's ``"Text"`` is appended to the content field.
        A final chunk is emitted if the working chunk still holds data.

        Args:
            RawLvlsDict: See ``readInput``.
            RawDataDict: See ``readInput``.

        Returns:
            list[OrderedDict]: Chunks with a 1-based ``"Index"`` key first.
        """
        self.readInput(RawLvlsDict, RawDataDict)
        temp = self._new_temp()

        for p in self.paragraphs:
            text = p.get("Text") or ""
            marker = p.get("MarkerType", None) or "none"
            matched_field = self._match_level(marker)

            if matched_field is None:
                # Not a heading: plain content of the current chunk.
                temp[self.last_field].append(text)
                continue

            # A new heading closes the current chunk if it has anything
            # at this level or below.
            if self._has_data_from_level(temp, matched_field):
                self._emit(temp)
            temp[matched_field] = text
            self._reset_deeper(temp, matched_field)

        # Flush whatever is left after the last paragraph.
        if self._temp_has_data(temp):
            self._emit(temp)
        return self.StructDict