# doc-ai-api / App_Caller.py
import faiss
import fitz
from sentence_transformers import CrossEncoder
from Config import Configs
from Config import ModelLoader as ML
from Libraries import Common_MyUtils as MU, Common_TextProcess as TP
from Libraries import PDF_ExtractData as ExtractData, PDF_MergeData as MergeData, PDF_QualityCheck as QualityCheck
from Libraries import Json_ChunkUnder as ChunkUnder
from Libraries import Faiss_Searching as F_Searching, Faiss_ChunkMapping as ChunkMapper
from Libraries import Summarizer_Runner as SummaryRun
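# Run configuration: target service, input PDF, and the per-service paths and model names resolved from Configs.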
Checkpoint = "vinai/bartpho-syllable"
service = "Categories"
inputs = "BAD.pdf"
JsonKey = "paragraphs"
JsonField = "Text"
config = Configs.ConfigValues(service=service, inputs=inputs)
inputPath = config["inputPath"]
PdfPath = config["PdfPath"]
DocPath = config["DocPath"]
exceptPath = config["exceptPath"]
markerPath = config["markerPath"]
statusPath = config["statusPath"]
RawDataPath = config["RawDataPath"]
RawLvlsPath = config["RawLvlsPath"]
StructsPath = config["StructsPath"]
SegmentPath = config["SegmentPath"]
SchemaPath = config["SchemaPath"]
FaissPath = config["FaissPath"]
MappingPath = config["MappingPath"]
MapDataPath = config["MapDataPath"]
MapChunkPath = config["MapChunkPath"]
MetaPath = config["MetaPath"]
DATA_KEY = config["DATA_KEY"]
EMBE_KEY = config["EMBE_KEY"]
SEARCH_EGINE = config["SEARCH_EGINE"]
RERANK_MODEL = config["RERANK_MODEL"]
RESPON_MODEL = config["RESPON_MODEL"]
EMBEDD_MODEL = config["EMBEDD_MODEL"]
CHUNKS_MODEL = config["CHUNKS_MODEL"]
SUMARY_MODEL = config["SUMARY_MODEL"]
WORD_LIMIT = config["WORD_LIMIT"]
MODEL_DIR = "Models"
MODEL_ENCODE = "Sentence_Transformer"
MODEL_SUMARY = "Summarizer"
EMBEDD_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{EMBEDD_MODEL}"
CHUNKS_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{CHUNKS_MODEL}"
SUMARY_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_SUMARY}/{SUMARY_MODEL}"
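# Summarizer token limits and training hyperparameters; module-level constants that are
# presumably consumed by training/summarization code elsewhere in the project, not in this file.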
MAX_INPUT = 1024
MAX_TARGET = 256
MIN_TARGET = 64
TRAIN_EPOCHS = 3
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.01
BATCH_SIZE = 4
def loadHardcodes(file_path, wanted=None):
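    """Read a JSON file of hardcoded items and return a {key: values} dict, optionally filtered to the `wanted` keys."""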
data = MU.read_json(file_path)
if "items" not in data:
return
result = {}
for item in data["items"]:
key = item["key"]
if (not wanted) or (key in wanted):
result[key] = item["values"]
return result
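# Hardcoded dictionaries used by the PDF extractor: exception word lists, structural markers,
# and sentence-status symbols.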
exceptData = loadHardcodes(exceptPath, wanted=["common_words", "proper_names", "abbreviations"])
markerData = loadHardcodes(markerPath, wanted=["keywords", "markers"])
statusData = loadHardcodes(statusPath, wanted=["brackets", "sentence_ends"])
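# Load the embedding, chunking, and summarization models through the shared ModelLoader.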
Loader = ML.ModelLoader()
indexer, embeddDevice = Loader.load_encoder(EMBEDD_MODEL, EMBEDD_CACHED_MODEL)
chunker, chunksDevice = Loader.load_encoder(CHUNKS_MODEL, CHUNKS_CACHED_MODEL)
tokenizer, summarizer, summaryDevice = Loader.load_summarizer(SUMARY_MODEL, SUMARY_CACHED_MODEL)
def runPrepareData():
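    """Load the prebuilt segment dictionary, chunk mappings, and FAISS index from disk."""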
SegmentDict = MU.read_json(SegmentPath)
Mapping = MU.read_json(MappingPath)
MapData = MU.read_json(MapDataPath)
MapChunk = MU.read_json(MapChunkPath)
faissIndex = faiss.read_index(FaissPath)
return SegmentDict, Mapping, MapData, MapChunk, faissIndex
SegmentDict, Mapping, MapData, MapChunk, faissIndex = runPrepareData()
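# Pipeline components: PDF data extractor, semantic chunk builder, recursive summarizer,
# cross-encoder reranker, and FAISS-backed semantic search engine.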
dataExtractor = ExtractData.B1Extractor(
exceptData,
markerData,
statusData,
proper_name_min_count=10
)
chunkUnder = ChunkUnder.ChunkUndertheseaBuilder(
embedder=indexer,
device=embeddDevice,
min_words=256,
max_words=768,
sim_threshold=0.7,
key_sent_ratio=0.4
)
summarizer_engine = SummaryRun.RecursiveSummarizer(
tokenizer=tokenizer,
summarizer=summarizer,
sum_device=summaryDevice,
chunk_builder=chunkUnder,
max_length=200,
min_length=100,
max_depth=4
)
reranker = CrossEncoder(RERANK_MODEL, device=str(embeddDevice))
searchEngine = F_Searching.SemanticSearchEngine(
indexer=indexer,
reranker=reranker,
device=str(embeddDevice),
normalize=True,
top_k=20,
rerank_k=10,
rerank_batch_size=16
)
def extractRun(pdf_doc):
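    """Extract line-level data from the PDF document and merge the lines into paragraphs."""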
extractedData = dataExtractor.extract(pdf_doc)
RawDataDict = MergeData.mergeLinesToParagraphs(extractedData)
return RawDataDict
def runSearch(query):
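    """Run a FAISS semantic search for the query and return the top candidate chunks."""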
results = searchEngine.search(
query=query,
faissIndex=faissIndex,
Mapping=Mapping,
MapData=MapData,
MapChunk=MapChunk,
top_k=20
)
return results
def runRerank(query, results):
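    """Re-score the search results with the cross-encoder reranker and keep the top matches."""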
reranked = searchEngine.rerank(
query=query,
results=results,
top_k=10
)
return reranked
def fileProcess(pdf_bytes):
"""Nhận file PDF bytes, thực hiện pipeline chính."""
pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
checker = QualityCheck.PDFQualityChecker()
is_good, metrics = checker.evaluate(pdf_doc)
print(metrics)
if not is_good:
print("⚠️ Bỏ qua file này.")
check_status = 0
summaryText = metrics["check_mess"]
bestArticle = ""
reranked = ""
else:
print("✅ Tiếp tục xử lý.")
check_status = 1,
RawDataDict = extractRun(pdf_doc)
full_text = TP.merge_txt(RawDataDict, JsonKey, JsonField)
        summarized = summarizer_engine.summarize(full_text, minInput=256, maxInput=1024)
summaryText = summarized["summary_text"]
        results = runSearch(summaryText)
        reranked = runRerank(summaryText, results)
chunkReturn = ChunkMapper.process_chunks_pipeline(
reranked_results=reranked,
SegmentDict=SegmentDict,
drop_fields=["Index"],
fields=["Article"],
n_chunks=1,
)
bestArticles = [item["fields"].get("Article") for item in chunkReturn["extracted_fields"]]
bestArticle = bestArticles[0] if len(bestArticles) == 1 else ", ".join(bestArticles)
pdf_doc.close()
return {
"checkstatus": check_status,
"metrics": metrics,
"summary": summaryText,
"category": bestArticle,
"reranked": reranked[:5] if reranked else []
}
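# Minimal usage sketch, not part of the original pipeline: it assumes PdfPath (resolved from
# the config above) points to a readable PDF on disk; adjust the path for your environment.
if __name__ == "__main__":
    with open(PdfPath, "rb") as f:
        sample_bytes = f.read()
    result = fileProcess(sample_bytes)
    print(result["checkstatus"], result["category"])
    print(result["summary"])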