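# PDF-to-FAISS indexing pipeline: extract text from a PDF, detect its heading
# structure, build chunks, derive a JSON schema, and embed the segments into a
# FAISS index with mapping files.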
import fitz

from Config import Configs
from Config import ModelLoader as ML
from Libraries import Common_MyUtils as MU
from Libraries import PDF_ExtractData as ExtractData, PDF_MergeData as MergeData, PDF_QualityCheck as QualityCheck
from Libraries import Json_GetStructures as GetStructures, Json_ChunkMaster as ChunkMaster, Json_SchemaExt as SchemaExt
from Libraries import Faiss_Embedding as F_Embedding
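# Service settings and the paths/keys resolved from the service configuration.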
Checkpoint = "vinai/bartpho-syllable"
service = "Categories"
inputs = "Categories.json"
JsonKey = "paragraphs"
JsonField = "Text"

config = Configs.ConfigValues(service=service, inputs=inputs)
inputPath = config["inputPath"]
PdfPath = config["PdfPath"]
DocPath = config["DocPath"]
exceptPath = config["exceptPath"]
markerPath = config["markerPath"]
statusPath = config["statusPath"]
RawDataPath = config["RawDataPath"]
RawLvlsPath = config["RawLvlsPath"]
StructsPath = config["StructsPath"]
SegmentPath = config["SegmentPath"]
SchemaPath = config["SchemaPath"]
FaissPath = config["FaissPath"]
MappingPath = config["MappingPath"]
MapDataPath = config["MapDataPath"]
MapChunkPath = config["MapChunkPath"]
MetaPath = config["MetaPath"]
DATA_KEY = config["DATA_KEY"]
EMBE_KEY = config["EMBE_KEY"]
SEARCH_EGINE = config["SEARCH_EGINE"]
RERANK_MODEL = config["RERANK_MODEL"]
RESPON_MODEL = config["RESPON_MODEL"]
EMBEDD_MODEL = config["EMBEDD_MODEL"]
CHUNKS_MODEL = config["CHUNKS_MODEL"]
SUMARY_MODEL = config["SUMARY_MODEL"]
WORD_LIMIT = config["WORD_LIMIT"]
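# Local model cache layout plus training and generation hyperparameters.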
MODEL_DIR = "Models"
MODEL_ENCODE = "Sentence_Transformer"
MODEL_SUMARY = "Summarizer"
EMBEDD_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{EMBEDD_MODEL}"
CHUNKS_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{CHUNKS_MODEL}"
SUMARY_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_SUMARY}/{SUMARY_MODEL}"

MAX_INPUT = 1024
MAX_TARGET = 256
MIN_TARGET = 64
TRAIN_EPOCHS = 3
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.01
BATCH_SIZE = 4
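# Read a hardcode JSON file and return a {key: values} dict for the wanted keys.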
def loadHardcodes(file_path, wanted=None):
    data = MU.read_json(file_path)
    if "items" not in data:
        return {}  # no "items" section: return an empty dict instead of None so callers always get a mapping
    result = {}
    for item in data["items"]:
        key = item["key"]
        if (not wanted) or (key in wanted):
            result[key] = item["values"]
    return result
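# Exception words, structural markers, and sentence-status symbols used during extraction.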
exceptData = loadHardcodes(exceptPath, wanted=["common_words", "proper_names", "abbreviations"])
markerData = loadHardcodes(markerPath, wanted=["keywords", "markers"])
statusData = loadHardcodes(statusPath, wanted=["brackets", "sentence_ends"])
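# Load the embedding and chunking encoder models along with their devices.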
Loader = ML.ModelLoader()
indexer, embeddDevice = Loader.load_encoder(EMBEDD_MODEL, EMBEDD_CACHED_MODEL)
chunker, chunksDevice = Loader.load_encoder(CHUNKS_MODEL, CHUNKS_CACHED_MODEL)
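# Pipeline components: PDF extractor, structure analyzer, chunk builder, schema extractor, and FAISS indexer.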
dataExtractor = ExtractData.B1Extractor(
    exceptData,
    markerData,
    statusData,
    proper_name_min_count=10
)

structAnalyzer = GetStructures.StructureAnalyzer(
    verbose=True
)

chunkBuilder = ChunkMaster.ChunkBuilder()

schemaExt = SchemaExt.JSONSchemaExtractor(
    list_policy="first",
    verbose=True
)

faissIndexer = F_Embedding.DirectFaissIndexer(
    indexer=indexer,
    device=str(embeddDevice),
    batch_size=32,
    show_progress=True,
    flatten_mode="split",
    join_sep="\n",
    allowed_schema_types=("string", "array", "dict"),
    max_chars_per_text=2000,
    normalize=True,
    verbose=False
)
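# Extract text from the PDF and merge the extracted lines into paragraphs.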
def extractRun(pdf_doc):
    extractedData = dataExtractor.extract(pdf_doc)
    RawDataDict = MergeData.mergeLinesToParagraphs(extractedData)
    return RawDataDict
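# Detect heading markers and derive the document's level structure.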
def structRun(RawDataDict):
    markers = structAnalyzer.extract_markers(RawDataDict)
    structures = structAnalyzer.build_structures(markers)
    dedup = structAnalyzer.deduplicate(structures)
    top = structAnalyzer.select_top(dedup)
    RawLvlsDict = structAnalyzer.extend_top(top, dedup)
    print(MU.json_convert(RawLvlsDict, pretty=True))
    return RawLvlsDict
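# Build structured chunks from the detected levels and the raw paragraphs.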
def chunkRun(RawLvlsDict=None, RawDataDict=None):
    StructsDict = chunkBuilder.build(RawLvlsDict, RawDataDict)
    return StructsDict
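# Keep only chunks whose first top-level field contains usable text, then renumber them.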
def SegmentRun(StructsDict, RawLvlsDict):
    first_key = list(RawLvlsDict[0].keys())[0]
    SegmentDict = []
    for item in StructsDict:
        value = item.get(first_key)
        if not value:
            continue
        if isinstance(value, list):
            value = " ".join(v.strip() for v in value if isinstance(v, str) and v.strip().lower() != "none")
        if value.strip():
            SegmentDict.append(item)
    for i, item in enumerate(SegmentDict, start=1):
        item["Index"] = i
    return SegmentDict
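# Extract a JSON schema from the segmented chunks.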
def schemaRun(SegmentDict):
    SchemaDict = schemaExt.schemaRun(SegmentDict=SegmentDict)
    print(SchemaDict)
    return SchemaDict
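# Embed the segments and build the FAISS index together with its mapping files.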
def Indexing(SchemaDict):
    Mapping, MapData = faissIndexer.build_from_json(
        SegmentPath=SegmentPath,
        SchemaDict=SchemaDict,
        FaissPath=FaissPath,
        MapDataPath=MapDataPath,
        MappingPath=MappingPath,
        MapChunkPath=MapChunkPath
    )
    return Mapping, MapData
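# "pdf": run the full extraction pipeline from PdfPath; anything else: reuse the saved SegmentPath file.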
mode = "json"

def Prepare():
    if mode == "pdf":
        print("\nLoading File...")
        pdf_doc = fitz.open(PdfPath)
        checker = QualityCheck.PDFQualityChecker()
        is_good, info = checker.evaluate(pdf_doc)
        print(info["status"])
        if not is_good:
            print("⚠️ Skipping this file.")
            return None, None, None, None
        else:
            print("✅ Continuing processing.")
        print("\nExtracting...")
        RawDataDict = extractRun(pdf_doc)
        MU.write_json(RawDataDict, RawDataPath, indent=1)
        pdf_doc.close()
        print("\nGetting Struct...")
        RawLvlsDict = structRun(RawDataDict)
        MU.write_json(RawLvlsDict, RawLvlsPath, indent=2)
        print("\nChunking...")
        StructsDict = chunkRun(RawLvlsDict, RawDataDict)
        MU.write_json(StructsDict, StructsPath, indent=2)
        print("\nSegmenting...")
        SegmentDict = SegmentRun(StructsDict, RawLvlsDict)
        MU.write_json(SegmentDict, SegmentPath, indent=2)
    else:
        SegmentDict = MU.read_json(SegmentPath)
    print("\nCreating Schema...")
    SchemaDict = schemaRun(SegmentDict)
    MU.write_json(SchemaDict, SchemaPath, indent=2)
    print("\nEmbedding...")
    Mapping, MapData = Indexing(SchemaDict)
    MU.write_json(Mapping, MappingPath, indent=2)
    MU.write_json(MapData, MapDataPath, indent=2)
    print("\nCompleted!")
    return SegmentDict, SchemaDict, Mapping, MapData

SegmentDict, SchemaDict, Mapping, MapData = Prepare()