# doc-ai-api / AppGenerator.py
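"""End-to-end index builder: extracts text from a PDF (or reuses a previously
segmented JSON), derives the document's level structure, chunks and segments
it, extracts a record schema, and builds a FAISS embedding index with its
mapping files."""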
import fitz  # PyMuPDF
from Config import Configs
from Config import ModelLoader as ML
from Libraries import Common_MyUtils as MU
from Libraries import PDF_ExtractData as ExtractData, PDF_MergeData as MergeData, PDF_QualityCheck as QualityCheck
from Libraries import Json_GetStructures as GetStructures, Json_ChunkMaster as ChunkMaster, Json_SchemaExt as SchemaExt
from Libraries import Faiss_Embedding as F_Embedding
Checkpoint = "vinai/bartpho-syllable"  # Vietnamese BARTpho base checkpoint; declared here but not referenced in this script
service = "Categories"
inputs = "Categories.json"
JsonKey = "paragraphs"
JsonField = "Text"
config = Configs.ConfigValues(service=service, inputs=inputs)
inputPath = config["inputPath"]
PdfPath = config["PdfPath"]
DocPath = config["DocPath"]
exceptPath = config["exceptPath"]
markerPath = config["markerPath"]
statusPath = config["statusPath"]
RawDataPath = config["RawDataPath"]
RawLvlsPath = config["RawLvlsPath"]
StructsPath = config["StructsPath"]
SegmentPath = config["SegmentPath"]
SchemaPath = config["SchemaPath"]
FaissPath = config["FaissPath"]
MappingPath = config["MappingPath"]
MapDataPath = config["MapDataPath"]
MapChunkPath = config["MapChunkPath"]
MetaPath = config["MetaPath"]
DATA_KEY = config["DATA_KEY"]
EMBE_KEY = config["EMBE_KEY"]
SEARCH_EGINE = config["SEARCH_EGINE"]
RERANK_MODEL = config["RERANK_MODEL"]
RESPON_MODEL = config["RESPON_MODEL"]
EMBEDD_MODEL = config["EMBEDD_MODEL"]
CHUNKS_MODEL = config["CHUNKS_MODEL"]
SUMARY_MODEL = config["SUMARY_MODEL"]
WORD_LIMIT = config["WORD_LIMIT"]
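# Note: SEARCH_EGINE, RERANK_MODEL, RESPON_MODEL, DATA_KEY, EMBE_KEY and
# WORD_LIMIT are loaded here but not used below; presumably they are consumed
# by the serving-side modules of this service.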
MODEL_DIR = "Models"
MODEL_ENCODE = "Sentence_Transformer"
MODEL_SUMARY = "Summarizer"
EMBEDD_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{EMBEDD_MODEL}"
CHUNKS_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{CHUNKS_MODEL}"
SUMARY_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_SUMARY}/{SUMARY_MODEL}"
MAX_INPUT = 1024
MAX_TARGET = 256
MIN_TARGET = 64
TRAIN_EPOCHS = 3
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.01
BATCH_SIZE = 4
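# Fine-tuning hyperparameters for the summarizer checkpoint above; they are
# not consumed in this script (presumably used by a separate training step).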
def loadHardcodes(file_path, wanted=None):
    """Read a hardcode JSON file and return {key: values}, optionally
    filtered to the keys listed in `wanted`."""
    data = MU.read_json(file_path)
    if "items" not in data:
        return {}  # empty mapping rather than None, so callers always get a dict
    result = {}
    for item in data["items"]:
        key = item["key"]
        if (not wanted) or (key in wanted):
            result[key] = item["values"]
    return result
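# Expected hardcode-file layout, as read by loadHardcodes above:
# {
#   "items": [
#     {"key": "common_words", "values": [...]},
#     {"key": "markers",      "values": [...]}
#   ]
# }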
exceptData = loadHardcodes(exceptPath, wanted=["common_words", "proper_names", "abbreviations"])
markerData = loadHardcodes(markerPath, wanted=["keywords", "markers"])
statusData = loadHardcodes(statusPath, wanted=["brackets", "sentence_ends"])
Loader = ML.ModelLoader()
indexer, embeddDevice = Loader.load_encoder(EMBEDD_MODEL, EMBEDD_CACHED_MODEL)
chunker, chunksDevice = Loader.load_encoder(CHUNKS_MODEL, CHUNKS_CACHED_MODEL)
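# NOTE: the chunking encoder (chunker/chunksDevice) is loaded alongside the
# embedding encoder but is not referenced again in this file.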
dataExtractor = ExtractData.B1Extractor(
    exceptData,
    markerData,
    statusData,
    proper_name_min_count=10
)
structAnalyzer = GetStructures.StructureAnalyzer(
    verbose=True
)
chunkBuilder = ChunkMaster.ChunkBuilder()
schemaExt = SchemaExt.JSONSchemaExtractor(
    list_policy="first",
    verbose=True
)
faissIndexer = F_Embedding.DirectFaissIndexer(
    indexer=indexer,
    device=str(embeddDevice),
    batch_size=32,
    show_progress=True,
    flatten_mode="split",
    join_sep="\n",
    allowed_schema_types=("string", "array", "dict"),
    max_chars_per_text=2000,
    normalize=True,
    verbose=False
)
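# Assumption about the local F_Embedding API: with flatten_mode="split" each
# allowed schema field is embedded as its own vector, texts are truncated at
# max_chars_per_text, and normalize=True normalizes embeddings before indexing.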
def extractRun(pdf_doc):
    """Extract raw lines from the PDF and merge them into paragraphs."""
    extractedData = dataExtractor.extract(pdf_doc)
    RawDataDict = MergeData.mergeLinesToParagraphs(extractedData)
    return RawDataDict
def structRun(RawDataDict):
    """Derive the document's heading/level structure from the raw paragraphs."""
    markers = structAnalyzer.extract_markers(RawDataDict)
    structures = structAnalyzer.build_structures(markers)
    dedup = structAnalyzer.deduplicate(structures)
    top = structAnalyzer.select_top(dedup)
    RawLvlsDict = structAnalyzer.extend_top(top, dedup)
    print(MU.json_convert(RawLvlsDict, pretty=True))
    return RawLvlsDict
def chunkRun(RawLvlsDict=None, RawDataDict=None):
    """Group the raw paragraphs into structured chunks using the level map."""
    StructsDict = chunkBuilder.build(RawLvlsDict, RawDataDict)
    return StructsDict
def SegmentRun(StructsDict, RawLvlsDict):
    """Keep only chunks with a non-empty top-level field, then re-index them."""
    first_key = list(RawLvlsDict[0].keys())[0]
    SegmentDict = []
    for item in StructsDict:
        value = item.get(first_key)
        if not value:
            continue
        if isinstance(value, list):
            # Join list values, dropping non-strings and literal "none" entries
            value = " ".join(v.strip() for v in value if isinstance(v, str) and v.strip().lower() != "none")
        if value.strip():
            SegmentDict.append(item)
    for i, item in enumerate(SegmentDict, start=1):
        item["Index"] = i
    return SegmentDict
def schemaRun(SegmentDict):
    """Extract a JSON schema describing the segmented records."""
    SchemaDict = schemaExt.schemaRun(SegmentDict=SegmentDict)
    print(SchemaDict)
    return SchemaDict
def Indexing(SchemaDict):
    """Embed the segments and write the FAISS index plus its mapping files."""
    Mapping, MapData = faissIndexer.build_from_json(
        SegmentPath=SegmentPath,
        SchemaDict=SchemaDict,
        FaissPath=FaissPath,
        MapDataPath=MapDataPath,
        MappingPath=MappingPath,
        MapChunkPath=MapChunkPath
    )
    return Mapping, MapData
mode = "json"  # "pdf": run the full extraction pipeline from PdfPath; "json": reuse the existing SegmentPath file
def Prepare():
    """Run the pipeline end to end and persist every intermediate artifact."""
    if mode == "pdf":
        print("\nLoading File...")
        pdf_doc = fitz.open(PdfPath)
        checker = QualityCheck.PDFQualityChecker()
        is_good, info = checker.evaluate(pdf_doc)
        print(info["status"])
        if not is_good:
            print("⚠️ Skipping this file.")
            return None, None, None, None
        else:
            print("✅ Continuing processing.")
        print("\nExtracting...")
        RawDataDict = extractRun(pdf_doc)
        MU.write_json(RawDataDict, RawDataPath, indent=1)
        pdf_doc.close()
        print("\nGetting Struct...")
        RawLvlsDict = structRun(RawDataDict)
        MU.write_json(RawLvlsDict, RawLvlsPath, indent=2)
        print("\nChunking...")
        StructsDict = chunkRun(RawLvlsDict, RawDataDict)
        MU.write_json(StructsDict, StructsPath, indent=2)
        print("\nSegmenting...")
        SegmentDict = SegmentRun(StructsDict, RawLvlsDict)
        MU.write_json(SegmentDict, SegmentPath, indent=2)
    else:
        SegmentDict = MU.read_json(SegmentPath)
    print("\nCreating Schema...")
    SchemaDict = schemaRun(SegmentDict)
    MU.write_json(SchemaDict, SchemaPath, indent=2)
    print("\nEmbedding...")
    Mapping, MapData = Indexing(SchemaDict)
    MU.write_json(Mapping, MappingPath, indent=2)
    MU.write_json(MapData, MapDataPath, indent=2)
    print("\nCompleted!")
    return SegmentDict, SchemaDict, Mapping, MapData
SegmentDict, SchemaDict, Mapping, MapData = Prepare()
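# The pipeline executes at module import time, so SegmentDict, SchemaDict,
# Mapping and MapData are ready for any module that imports AppGenerator.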