import fitz

from Config import Configs
from Config import ModelLoader as ML
from Libraries import Common_MyUtils as MU
from Libraries import PDF_ExtractData as ExtractData, PDF_MergeData as MergeData, PDF_QualityCheck as QualityCheck
from Libraries import Json_GetStructures as GetStructures, Json_ChunkMaster as ChunkMaster, Json_SchemaExt as SchemaExt
from Libraries import Faiss_Embedding as F_Embedding

# Service configuration (Checkpoint, JsonKey, and JsonField are not used in this script).
Checkpoint = "vinai/bartpho-syllable"
service = "Categories"
inputs = "Categories.json"
JsonKey = "paragraphs"
JsonField = "Text"

# Pull every path and model name this service needs from the central config.
config = Configs.ConfigValues(service=service, inputs=inputs)
inputPath = config["inputPath"]
PdfPath = config["PdfPath"]
DocPath = config["DocPath"]
exceptPath = config["exceptPath"]
markerPath = config["markerPath"]
statusPath = config["statusPath"]
RawDataPath = config["RawDataPath"]
RawLvlsPath = config["RawLvlsPath"]
StructsPath = config["StructsPath"]
SegmentPath = config["SegmentPath"]
SchemaPath = config["SchemaPath"]
FaissPath = config["FaissPath"]
MappingPath = config["MappingPath"]
MapDataPath = config["MapDataPath"]
MapChunkPath = config["MapChunkPath"]
MetaPath = config["MetaPath"]
DATA_KEY = config["DATA_KEY"]
EMBE_KEY = config["EMBE_KEY"]
SEARCH_EGINE = config["SEARCH_EGINE"]
RERANK_MODEL = config["RERANK_MODEL"]
RESPON_MODEL = config["RESPON_MODEL"]
EMBEDD_MODEL = config["EMBEDD_MODEL"]
CHUNKS_MODEL = config["CHUNKS_MODEL"]
SUMARY_MODEL = config["SUMARY_MODEL"]
WORD_LIMIT = config["WORD_LIMIT"]

# Local cache locations for the embedding, chunking, and summarizer models.
MODEL_DIR = "Models"
MODEL_ENCODE = "Sentence_Transformer"
MODEL_SUMARY = "Summarizer"
EMBEDD_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{EMBEDD_MODEL}"
CHUNKS_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{CHUNKS_MODEL}"
SUMARY_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_SUMARY}/{SUMARY_MODEL}"

# Summarizer fine-tuning hyperparameters (not referenced in this script).
MAX_INPUT = 1024
MAX_TARGET = 256
MIN_TARGET = 64
TRAIN_EPOCHS = 3
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.01
BATCH_SIZE = 4


def loadHardcodes(file_path, wanted=None):
    """Read a hardcode JSON file and return {key: values} for the wanted keys."""
    data = MU.read_json(file_path)
    if "items" not in data:
        return {}  # empty dict instead of None so callers never receive a non-mapping
    result = {}
    for item in data["items"]:
        key = item["key"]
        if (not wanted) or (key in wanted):
            result[key] = item["values"]
    return result


exceptData = loadHardcodes(exceptPath, wanted=["common_words", "proper_names", "abbreviations"])
markerData = loadHardcodes(markerPath, wanted=["keywords", "markers"])
statusData = loadHardcodes(statusPath, wanted=["brackets", "sentence_ends"])

# Load the encoder models once and share them across the pipeline components.
Loader = ML.ModelLoader()
indexer, embeddDevice = Loader.load_encoder(EMBEDD_MODEL, EMBEDD_CACHED_MODEL)
chunker, chunksDevice = Loader.load_encoder(CHUNKS_MODEL, CHUNKS_CACHED_MODEL)

dataExtractor = ExtractData.B1Extractor(
    exceptData, markerData, statusData,
    proper_name_min_count=10
)
structAnalyzer = GetStructures.StructureAnalyzer(
    verbose=True
)
chunkBuilder = ChunkMaster.ChunkBuilder()
schemaExt = SchemaExt.JSONSchemaExtractor(
    list_policy="first",
    verbose=True
)
faissIndexer = F_Embedding.DirectFaissIndexer(
    indexer=indexer,
    device=str(embeddDevice),
    batch_size=32,
    show_progress=True,
    flatten_mode="split",
    join_sep="\n",
    allowed_schema_types=("string", "array", "dict"),
    max_chars_per_text=2000,
    normalize=True,
    verbose=False
)


def extractRun(pdf_doc):
    """Extract raw lines from the PDF and merge them into paragraphs."""
    extractedData = dataExtractor.extract(pdf_doc)
    RawDataDict = MergeData.mergeLinesToParagraphs(extractedData)
    return RawDataDict


def structRun(RawDataDict):
    """Detect heading markers and derive the document's level structure."""
    markers = structAnalyzer.extract_markers(RawDataDict)
    structures = structAnalyzer.build_structures(markers)
    dedup = structAnalyzer.deduplicate(structures)
    top = structAnalyzer.select_top(dedup)
    RawLvlsDict = structAnalyzer.extend_top(top, dedup)
    print(MU.json_convert(RawLvlsDict, pretty=True))  # debug: show the detected structure
    return RawLvlsDict


def chunkRun(RawLvlsDict=None, RawDataDict=None):
    """Combine the level structure and raw paragraphs into structured chunks."""
    StructsDict = chunkBuilder.build(RawLvlsDict, RawDataDict)
    return StructsDict


def SegmentRun(StructsDict, RawLvlsDict):
    """Keep only chunks whose top-level field is non-empty, then re-index them."""
    first_key = list(RawLvlsDict[0].keys())[0]
    SegmentDict = []
    for item in StructsDict:
        value = item.get(first_key)
        if not value:
            continue
        if isinstance(value, list):
            value = " ".join(
                v.strip() for v in value
                if isinstance(v, str) and v.strip().lower() != "none"
            )
        if value.strip():
            SegmentDict.append(item)
    for i, item in enumerate(SegmentDict, start=1):
        item["Index"] = i
    return SegmentDict


def schemaRun(SegmentDict):
    """Extract a JSON schema describing the segmented chunks."""
    SchemaDict = schemaExt.schemaRun(SegmentDict=SegmentDict)
    print(SchemaDict)  # debug: show the extracted schema
    return SchemaDict


def Indexing(SchemaDict):
    """Embed the segmented chunks and build the FAISS index plus its mappings."""
    Mapping, MapData = faissIndexer.build_from_json(
        SegmentPath=SegmentPath,
        SchemaDict=SchemaDict,
        FaissPath=FaissPath,
        MapDataPath=MapDataPath,
        MappingPath=MappingPath,
        MapChunkPath=MapChunkPath
    )
    return Mapping, MapData


# "pdf": run the full extraction chain on PdfPath; "json": reuse an existing SegmentPath file.
mode = "json"


def Prepare():
    if mode == "pdf":
        print("\nLoading File...")
        pdf_doc = fitz.open(PdfPath)

        # Skip low-quality PDFs before spending time on extraction.
        checker = QualityCheck.PDFQualityChecker()
        is_good, info = checker.evaluate(pdf_doc)
        print(info["status"])
        if not is_good:
            print("⚠️ Skipping this file.")
            return None, None, None, None
        print("✅ Continuing processing.")

        print("\nExtracting...")
        RawDataDict = extractRun(pdf_doc)
        MU.write_json(RawDataDict, RawDataPath, indent=1)
        pdf_doc.close()

        print("\nGetting Struct...")
        RawLvlsDict = structRun(RawDataDict)
        MU.write_json(RawLvlsDict, RawLvlsPath, indent=2)

        print("\nChunking...")
        StructsDict = chunkRun(RawLvlsDict, RawDataDict)
        MU.write_json(StructsDict, StructsPath, indent=2)

        print("\nSegmenting...")
        SegmentDict = SegmentRun(StructsDict, RawLvlsDict)
        MU.write_json(SegmentDict, SegmentPath, indent=2)
    else:
        SegmentDict = MU.read_json(SegmentPath)

    print("\nCreating Schema...")
    SchemaDict = schemaRun(SegmentDict)
    MU.write_json(SchemaDict, SchemaPath, indent=2)

    print("\nEmbedding...")
    Mapping, MapData = Indexing(SchemaDict)
    MU.write_json(Mapping, MappingPath, indent=2)
    MU.write_json(MapData, MapDataPath, indent=2)

    print("\nCompleted!")
    return SegmentDict, SchemaDict, Mapping, MapData


SegmentDict, SchemaDict, Mapping, MapData = Prepare()