# doc-ai-api / AppGenerator.py
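"""End-to-end index builder: extracts text from a PDF (or reuses a previously
segmented JSON), derives the document's level structure, chunks and segments
it, extracts a record schema, and builds a FAISS embedding index with its
mapping files."""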
import fitz  # PyMuPDF
from Config import Configs
from Config import ModelLoader as ML
from Libraries import Common_MyUtils as MU
from Libraries import PDF_ExtractData as ExtractData, PDF_MergeData as MergeData, PDF_QualityCheck as QualityCheck
from Libraries import Json_GetStructures as GetStructures, Json_ChunkMaster as ChunkMaster, Json_SchemaExt as SchemaExt
from Libraries import Faiss_Embedding as F_Embedding
Checkpoint = "vinai/bartpho-syllable"  # Vietnamese BARTpho base checkpoint; declared here but not referenced in this script
service = "Categories"
inputs = "Categories.json"
JsonKey = "paragraphs"
JsonField = "Text"
config = Configs.ConfigValues(service=service, inputs=inputs)
inputPath = config["inputPath"]
PdfPath = config["PdfPath"]
DocPath = config["DocPath"]
exceptPath = config["exceptPath"]
markerPath = config["markerPath"]
statusPath = config["statusPath"]
RawDataPath = config["RawDataPath"]
RawLvlsPath = config["RawLvlsPath"]
StructsPath = config["StructsPath"]
SegmentPath = config["SegmentPath"]
SchemaPath = config["SchemaPath"]
FaissPath = config["FaissPath"]
MappingPath = config["MappingPath"]
MapDataPath = config["MapDataPath"]
MapChunkPath = config["MapChunkPath"]
MetaPath = config["MetaPath"]
DATA_KEY = config["DATA_KEY"]
EMBE_KEY = config["EMBE_KEY"]
SEARCH_EGINE = config["SEARCH_EGINE"]
RERANK_MODEL = config["RERANK_MODEL"]
RESPON_MODEL = config["RESPON_MODEL"]
EMBEDD_MODEL = config["EMBEDD_MODEL"]
CHUNKS_MODEL = config["CHUNKS_MODEL"]
SUMARY_MODEL = config["SUMARY_MODEL"]
WORD_LIMIT = config["WORD_LIMIT"]
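# Note: SEARCH_EGINE, RERANK_MODEL, RESPON_MODEL, DATA_KEY, EMBE_KEY and
# WORD_LIMIT are loaded here but not used below; presumably they are consumed
# by the serving-side modules of this service.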
MODEL_DIR = "Models"
MODEL_ENCODE = "Sentence_Transformer"
MODEL_SUMARY = "Summarizer"
EMBEDD_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{EMBEDD_MODEL}"
CHUNKS_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{CHUNKS_MODEL}"
SUMARY_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_SUMARY}/{SUMARY_MODEL}"
MAX_INPUT = 1024
MAX_TARGET = 256
MIN_TARGET = 64
TRAIN_EPOCHS = 3
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.01
BATCH_SIZE = 4
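# Fine-tuning hyperparameters for the summarizer checkpoint above; they are
# not consumed in this script (presumably used by a separate training step).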
def loadHardcodes(file_path, wanted=None):
    """Read a hardcode JSON file and return {key: values}, optionally
    filtered to the keys listed in `wanted`."""
    data = MU.read_json(file_path)
    if "items" not in data:
        return {}  # empty mapping rather than None, so callers always get a dict
    result = {}
    for item in data["items"]:
        key = item["key"]
        if (not wanted) or (key in wanted):
            result[key] = item["values"]
    return result
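# Expected hardcode-file layout, as read by loadHardcodes above:
# {
#   "items": [
#     {"key": "common_words", "values": [...]},
#     {"key": "markers",      "values": [...]}
#   ]
# }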
exceptData = loadHardcodes(exceptPath, wanted=["common_words", "proper_names", "abbreviations"])
markerData = loadHardcodes(markerPath, wanted=["keywords", "markers"])
statusData = loadHardcodes(statusPath, wanted=["brackets", "sentence_ends"])
Loader = ML.ModelLoader()
indexer, embeddDevice = Loader.load_encoder(EMBEDD_MODEL, EMBEDD_CACHED_MODEL)
chunker, chunksDevice = Loader.load_encoder(CHUNKS_MODEL, CHUNKS_CACHED_MODEL)
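# NOTE: the chunking encoder (chunker/chunksDevice) is loaded alongside the
# embedding encoder but is not referenced again in this file.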
dataExtractor = ExtractData.B1Extractor(
    exceptData,
    markerData,
    statusData,
    proper_name_min_count=10
)
structAnalyzer = GetStructures.StructureAnalyzer(
    verbose=True
)
chunkBuilder = ChunkMaster.ChunkBuilder()
schemaExt = SchemaExt.JSONSchemaExtractor(
    list_policy="first",
    verbose=True
)
faissIndexer = F_Embedding.DirectFaissIndexer(
    indexer=indexer,
    device=str(embeddDevice),
    batch_size=32,
    show_progress=True,
    flatten_mode="split",
    join_sep="\n",
    allowed_schema_types=("string", "array", "dict"),
    max_chars_per_text=2000,
    normalize=True,
    verbose=False
)
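# Assumption about the local F_Embedding API: with flatten_mode="split" each
# allowed schema field is embedded as its own vector, texts are truncated at
# max_chars_per_text, and normalize=True normalizes embeddings before indexing.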
def extractRun(pdf_doc):
    """Extract raw lines from the PDF and merge them into paragraphs."""
    extractedData = dataExtractor.extract(pdf_doc)
    RawDataDict = MergeData.mergeLinesToParagraphs(extractedData)
    return RawDataDict
def structRun(RawDataDict):
    """Derive the document's heading/level structure from the raw paragraphs."""
    markers = structAnalyzer.extract_markers(RawDataDict)
    structures = structAnalyzer.build_structures(markers)
    dedup = structAnalyzer.deduplicate(structures)
    top = structAnalyzer.select_top(dedup)
    RawLvlsDict = structAnalyzer.extend_top(top, dedup)
    print(MU.json_convert(RawLvlsDict, pretty=True))
    return RawLvlsDict
def chunkRun(RawLvlsDict=None, RawDataDict=None):
    """Group the raw paragraphs into structured chunks using the level map."""
    StructsDict = chunkBuilder.build(RawLvlsDict, RawDataDict)
    return StructsDict
def SegmentRun(StructsDict, RawLvlsDict):
    """Keep only chunks with a non-empty top-level field, then re-index them."""
    first_key = list(RawLvlsDict[0].keys())[0]
    SegmentDict = []
    for item in StructsDict:
        value = item.get(first_key)
        if not value:
            continue
        if isinstance(value, list):
            # Join list values, dropping non-strings and literal "none" entries
            value = " ".join(v.strip() for v in value if isinstance(v, str) and v.strip().lower() != "none")
        if value.strip():
            SegmentDict.append(item)
    for i, item in enumerate(SegmentDict, start=1):
        item["Index"] = i
    return SegmentDict
def schemaRun(SegmentDict):
    """Extract a JSON schema describing the segmented records."""
    SchemaDict = schemaExt.schemaRun(SegmentDict=SegmentDict)
    print(SchemaDict)
    return SchemaDict
def Indexing(SchemaDict):
    """Embed the segments and write the FAISS index plus its mapping files."""
    Mapping, MapData = faissIndexer.build_from_json(
        SegmentPath=SegmentPath,
        SchemaDict=SchemaDict,
        FaissPath=FaissPath,
        MapDataPath=MapDataPath,
        MappingPath=MappingPath,
        MapChunkPath=MapChunkPath
    )
    return Mapping, MapData
mode = "json"  # "pdf": run the full extraction pipeline from PdfPath; "json": reuse the existing SegmentPath file
def Prepare():
    """Run the pipeline end to end and persist every intermediate artifact."""
    if mode == "pdf":
        print("\nLoading File...")
        pdf_doc = fitz.open(PdfPath)
        checker = QualityCheck.PDFQualityChecker()
        is_good, info = checker.evaluate(pdf_doc)
        print(info["status"])
        if not is_good:
            print("⚠️ Skipping this file.")
            return None, None, None, None
        else:
            print("✅ Continuing processing.")
        print("\nExtracting...")
        RawDataDict = extractRun(pdf_doc)
        MU.write_json(RawDataDict, RawDataPath, indent=1)
        pdf_doc.close()
        print("\nGetting Struct...")
        RawLvlsDict = structRun(RawDataDict)
        MU.write_json(RawLvlsDict, RawLvlsPath, indent=2)
        print("\nChunking...")
        StructsDict = chunkRun(RawLvlsDict, RawDataDict)
        MU.write_json(StructsDict, StructsPath, indent=2)
        print("\nSegmenting...")
        SegmentDict = SegmentRun(StructsDict, RawLvlsDict)
        MU.write_json(SegmentDict, SegmentPath, indent=2)
    else:
        SegmentDict = MU.read_json(SegmentPath)
    print("\nCreating Schema...")
    SchemaDict = schemaRun(SegmentDict)
    MU.write_json(SchemaDict, SchemaPath, indent=2)
    print("\nEmbedding...")
    Mapping, MapData = Indexing(SchemaDict)
    MU.write_json(Mapping, MappingPath, indent=2)
    MU.write_json(MapData, MapDataPath, indent=2)
    print("\nCompleted!")
    return SegmentDict, SchemaDict, Mapping, MapData
SegmentDict, SchemaDict, Mapping, MapData = Prepare()
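# The pipeline executes at module import time, so SegmentDict, SchemaDict,
# Mapping and MapData are ready for any module that imports AppGenerator.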