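"""Categorization pipeline: quality-check an incoming PDF, extract and merge its text,
summarize it, then search and rerank a FAISS index of category chunks to pick the best
matching "Article" label."""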
import faiss
import fitz
from sentence_transformers import CrossEncoder
from Config import Configs
from Config import ModelLoader as ML
from Libraries import Common_MyUtils as MU, Common_TextProcess as TP
from Libraries import PDF_ExtractData as ExtractData, PDF_MergeData as MergeData, PDF_QualityCheck as QualityCheck
from Libraries import Json_ChunkUnder as ChunkUnder
from Libraries import Faiss_Searching as F_Searching, Faiss_ChunkMapping as ChunkMapper
from Libraries import Summarizer_Runner as SummaryRun

Checkpoint = "vinai/bartpho-syllable"
service = "Categories"
inputs = "BAD.pdf"
JsonKey = "paragraphs"
JsonField = "Text"
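
# Paths, data keys, and model names resolved from the service configuration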
config = Configs.ConfigValues(service=service, inputs=inputs)
inputPath = config["inputPath"]
PdfPath = config["PdfPath"]
DocPath = config["DocPath"]
exceptPath = config["exceptPath"]
markerPath = config["markerPath"]
statusPath = config["statusPath"]
RawDataPath = config["RawDataPath"]
RawLvlsPath = config["RawLvlsPath"]
StructsPath = config["StructsPath"]
SegmentPath = config["SegmentPath"]
SchemaPath = config["SchemaPath"]
FaissPath = config["FaissPath"]
MappingPath = config["MappingPath"]
MapDataPath = config["MapDataPath"]
MapChunkPath = config["MapChunkPath"]
MetaPath = config["MetaPath"]
DATA_KEY = config["DATA_KEY"]
EMBE_KEY = config["EMBE_KEY"]
SEARCH_EGINE = config["SEARCH_EGINE"]
RERANK_MODEL = config["RERANK_MODEL"]
RESPON_MODEL = config["RESPON_MODEL"]
EMBEDD_MODEL = config["EMBEDD_MODEL"]
CHUNKS_MODEL = config["CHUNKS_MODEL"]
SUMARY_MODEL = config["SUMARY_MODEL"]
WORD_LIMIT = config["WORD_LIMIT"]
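
# Local cache locations for the encoder and summarizer models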
MODEL_DIR = "Models"
MODEL_ENCODE = "Sentence_Transformer"
MODEL_SUMARY = "Summarizer"
EMBEDD_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{EMBEDD_MODEL}"
CHUNKS_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{CHUNKS_MODEL}"
SUMARY_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_SUMARY}/{SUMARY_MODEL}"
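
# Summarizer input/target token limits and training hyperparameters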
MAX_INPUT = 1024
MAX_TARGET = 256
MIN_TARGET = 64
TRAIN_EPOCHS = 3
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.01
BATCH_SIZE = 4

def loadHardcodes(file_path, wanted=None):
    """Read a JSON file shaped like {"items": [{"key": ..., "values": ...}]} and
    return a {key: values} dict, optionally filtered to the keys in `wanted`.
    Returns None when the file has no "items" entry."""
    data = MU.read_json(file_path)
    if "items" not in data:
        return None
    result = {}
    for item in data["items"]:
        key = item["key"]
        if (not wanted) or (key in wanted):
            result[key] = item["values"]
    return result

exceptData = loadHardcodes(exceptPath, wanted=["common_words", "proper_names", "abbreviations"])
markerData = loadHardcodes(markerPath, wanted=["keywords", "markers"])
statusData = loadHardcodes(statusPath, wanted=["brackets", "sentence_ends"])
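
# Load the embedding encoder, the chunking encoder, and the summarizer (tokenizer + model)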
Loader = ML.ModelLoader()
indexer, embeddDevice = Loader.load_encoder(EMBEDD_MODEL, EMBEDD_CACHED_MODEL)
chunker, chunksDevice = Loader.load_encoder(CHUNKS_MODEL, CHUNKS_CACHED_MODEL)
tokenizer, summarizer, summaryDevice = Loader.load_summarizer(SUMARY_MODEL, SUMARY_CACHED_MODEL)

def runPrepareData():
    """Load the prebuilt segment dictionary, chunk mappings, and FAISS index from disk."""
    SegmentDict = MU.read_json(SegmentPath)
    Mapping = MU.read_json(MappingPath)
    MapData = MU.read_json(MapDataPath)
    MapChunk = MU.read_json(MapChunkPath)
    faissIndex = faiss.read_index(FaissPath)
    return SegmentDict, Mapping, MapData, MapChunk, faissIndex

SegmentDict, Mapping, MapData, MapChunk, faissIndex = runPrepareData()
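
# Pipeline components: PDF extractor, semantic chunker, recursive summarizer,
# cross-encoder reranker, and FAISS-backed search engine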
dataExtractor = ExtractData.B1Extractor(
    exceptData,
    markerData,
    statusData,
    proper_name_min_count=10
)

chunkUnder = ChunkUnder.ChunkUndertheseaBuilder(
    embedder=indexer,
    device=embeddDevice,
    min_words=256,
    max_words=768,
    sim_threshold=0.7,
    key_sent_ratio=0.4
)

summarizer_engine = SummaryRun.RecursiveSummarizer(
    tokenizer=tokenizer,
    summarizer=summarizer,
    sum_device=summaryDevice,
    chunk_builder=chunkUnder,
    max_length=200,
    min_length=100,
    max_depth=4
)

reranker = CrossEncoder(RERANK_MODEL, device=str(embeddDevice))

searchEngine = F_Searching.SemanticSearchEngine(
    indexer=indexer,
    reranker=reranker,
    device=str(embeddDevice),
    normalize=True,
    top_k=20,
    rerank_k=10,
    rerank_batch_size=16
)

def extractRun(pdf_doc):
    """Extract structured data from the PDF and merge its lines into paragraphs."""
    extractedData = dataExtractor.extract(pdf_doc)
    RawDataDict = MergeData.mergeLinesToParagraphs(extractedData)
    return RawDataDict

def runSearch(query):
    """Retrieve the top candidate chunks for a query from the FAISS index."""
    results = searchEngine.search(
        query=query,
        faissIndex=faissIndex,
        Mapping=Mapping,
        MapData=MapData,
        MapChunk=MapChunk,
        top_k=20
    )
    return results

def runRerank(query, results):
    """Rerank retrieved chunks with the cross-encoder and keep the best ones."""
    reranked = searchEngine.rerank(
        query=query,
        results=results,
        top_k=10
    )
    return reranked

def fileProcess(pdf_bytes):
    """Take raw PDF bytes and run the main pipeline: quality check, extract,
    summarize, search, rerank, and map the best chunks to a category."""
    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    checker = QualityCheck.PDFQualityChecker()
    is_good, metrics = checker.evaluate(pdf_doc)
    print(metrics)
    if not is_good:
        print("⚠️ Skipping this file.")
        check_status = 0
        summaryText = metrics["check_mess"]
        bestArticle = ""
        reranked = ""
    else:
        print("✅ Continuing processing.")
        check_status = 1
        RawDataDict = extractRun(pdf_doc)
        full_text = TP.merge_txt(RawDataDict, JsonKey, JsonField)
        summarized = summarizer_engine.summarize(full_text, minInput=256, maxInput=1024)
        summaryText = summarized["summary_text"]
        results = runSearch(summaryText)
        reranked = runRerank(summaryText, results)
        chunkReturn = ChunkMapper.process_chunks_pipeline(
            reranked_results=reranked,
            SegmentDict=SegmentDict,
            drop_fields=["Index"],
            fields=["Article"],
            n_chunks=1,
        )
        bestArticles = [item["fields"].get("Article") for item in chunkReturn["extracted_fields"]]
        bestArticle = bestArticles[0] if len(bestArticles) == 1 else ", ".join(bestArticles)
    pdf_doc.close()
    return {
        "checkstatus": check_status,
        "metrics": metrics,
        "summary": summaryText,
        "category": bestArticle,
        "reranked": reranked[:5] if reranked else []
    }
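
# Illustrative usage sketch (an assumption, not part of the original script):
# fileProcess() expects raw PDF bytes, so a local file can be read in binary mode
# and passed in. The file name below is hypothetical.
if __name__ == "__main__":
    with open("sample.pdf", "rb") as f:  # hypothetical input file
        output = fileProcess(f.read())
    print(output["checkstatus"], output["category"])
    print(output["summary"])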