Spaces:

Orias171
/

doc-ai-api

Sleeping

App Files Files Community

doc-ai-api / Libraries /Json_ChunkMaster.py

LongK171

Add all

dbe2c62 about 2 months ago

raw

history blame contribute delete

3.19 kB

	from collections import OrderedDict
	from copy import deepcopy

	class ChunkBuilder:
	def readInput(self, RawLvlsDict=None, RawDataDict=None):
	# Đọc dữ liệu

	self.struct_spec = RawLvlsDict[0]
	self.paragraphs = sorted(
	RawDataDict.get("paragraphs", []),
	key=lambda x: x.get("Paragraph", 0)
	)

	# Chuẩn bị cấu trúc
	self.ordered_fields = list(self.struct_spec.keys())
	self.last_field = self.ordered_fields[-1]
	self.level_fields = self.ordered_fields[:-1]

	# Tập marker cho từng field
	self.marker_dict = {}
	for fld in self.ordered_fields:
	vals = self.struct_spec.get(fld, [])
	self.marker_dict[fld] = set(vals) if isinstance(vals, list) else set()

	# Biến tạm
	self.StructDict = []
	self.index_counter = 1

	# ===== Các hàm tiện ích =====
	def _new_temp(self):
	return {fld: "" for fld in self.level_fields} \| {self.last_field: []}

	def _temp_has_data(self, temp):
	return any(temp[f].strip() for f in self.level_fields) or bool(temp[self.last_field])

	def _reset_deeper(self, temp, touched_field):
	idx = self.level_fields.index(touched_field)
	for f in self.level_fields[idx+1:]:
	temp[f] = ""
	temp[self.last_field] = []

	def _has_data_from_level(self, temp, fld):
	"""Kiểm tra từ level fld trở xuống có dữ liệu không"""
	if fld not in self.level_fields:
	return False
	idx = self.level_fields.index(fld)
	for f in self.level_fields[idx:]:
	if temp[f].strip():
	return True
	if temp[self.last_field]:
	return True
	return False

	def _with_index(self, temp, idx):
	"""Tạo OrderedDict với Index đứng đầu"""
	od = OrderedDict()
	od["Index"] = idx
	for f in self.level_fields:
	od[f] = temp[f]
	od[self.last_field] = temp[self.last_field]
	return od

	# ===== Hàm chính =====
	def build(self, RawLvlsDict=None, RawDataDict=None):
	self.readInput(RawLvlsDict, RawDataDict)
	temp = self._new_temp()
	for p in self.paragraphs:
	text = p.get("Text") or ""
	marker = p.get("MarkerType", None) or "none"

	matched_field = None
	for fld in self.level_fields:
	if marker in self.marker_dict.get(fld, set()):
	matched_field = fld
	break

	if matched_field is not None:
	if self._has_data_from_level(temp, matched_field):
	self.StructDict.append(self._with_index(deepcopy(temp), self.index_counter))
	self.index_counter += 1

	temp[matched_field] = text
	self._reset_deeper(temp, matched_field)
	else:
	temp[self.last_field].append(text)

	if self._temp_has_data(temp):
	self.StructDict.append(self._with_index(deepcopy(temp), self.index_counter))
	self.index_counter += 1

	return self.StructDict