# Source: Hugging Face Space "Orias171/doc-ai-api" (status: Sleeping) - file size: 6,454 bytes - revision dbe2c62

import fitz

from Config import Configs
from Config import ModelLoader as ML

from Libraries import Common_MyUtils as MU
from Libraries import PDF_ExtractData as ExtractData, PDF_MergeData as MergeData, PDF_QualityCheck as QualityCheck
from Libraries import Json_GetStructures as GetStructures, Json_ChunkMaster as ChunkMaster, Json_SchemaExt as SchemaExt
from Libraries import Faiss_Embedding as F_Embedding

# Module-level settings.
# `service` and `inputs` are forwarded to Configs.ConfigValues right below.
# Checkpoint, JsonKey and JsonField are not referenced anywhere else in the
# visible part of this file; presumably they are read by importers - TODO confirm.
Checkpoint = "vinai/bartpho-syllable"
service = "Categories"
inputs = "Categories.json"
JsonKey = "paragraphs"
JsonField = "Text"

# Resolve every path, key and model name used by the pipeline from the project
# configuration for this service. Each value is looked up by key, so a key that
# the configuration does not provide fails here, at import time.
config = Configs.ConfigValues(service=service, inputs=inputs)
# File locations (inputs, intermediate artefacts and index outputs).
inputPath = config["inputPath"]
PdfPath = config["PdfPath"]
DocPath = config["DocPath"]
exceptPath = config["exceptPath"]
markerPath = config["markerPath"]
statusPath = config["statusPath"]
RawDataPath = config["RawDataPath"]
RawLvlsPath = config["RawLvlsPath"]
StructsPath = config["StructsPath"]
SegmentPath = config["SegmentPath"]
SchemaPath = config["SchemaPath"]
FaissPath = config["FaissPath"]
MappingPath = config["MappingPath"]
MapDataPath = config["MapDataPath"]
MapChunkPath = config["MapChunkPath"]
MetaPath = config["MetaPath"]
DATA_KEY = config["DATA_KEY"]
EMBE_KEY = config["EMBE_KEY"]
# NOTE(review): "SEARCH_EGINE" looks like a misspelling of "SEARCH_ENGINE", but
# it has to match the key produced by Configs.ConfigValues - rename both or neither.
SEARCH_EGINE = config["SEARCH_EGINE"]
# Model identifiers; EMBEDD_MODEL, CHUNKS_MODEL and SUMARY_MODEL are also used
# to build the cache paths further down.
RERANK_MODEL = config["RERANK_MODEL"]
RESPON_MODEL = config["RESPON_MODEL"]
EMBEDD_MODEL = config["EMBEDD_MODEL"]
CHUNKS_MODEL = config["CHUNKS_MODEL"]
SUMARY_MODEL = config["SUMARY_MODEL"]
WORD_LIMIT = config["WORD_LIMIT"]

# Local cache locations for the models, laid out as
# "<MODEL_DIR>/<model family>/<model name>".
MODEL_DIR = "Models"
MODEL_ENCODE = "Sentence_Transformer"
MODEL_SUMARY = "Summarizer"
EMBEDD_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{EMBEDD_MODEL}"
CHUNKS_CACHED_MODEL = F"{MODEL_DIR}/{MODEL_ENCODE}/{CHUNKS_MODEL}"
# Not used in the visible part of this file.
SUMARY_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_SUMARY}/{SUMARY_MODEL}"

# Numeric hyper-parameters. None of them is referenced in the visible part of
# this file; judging by the names they configure summarizer training and
# generation lengths - TODO confirm against the code that imports them.
MAX_INPUT = 1024
MAX_TARGET = 256
MIN_TARGET = 64
TRAIN_EPOCHS = 3
LEARNING_RATE = 3e-5
WEIGHT_DECAY = 0.01
BATCH_SIZE = 4

def loadHardcodes(file_path, wanted=None):
    """Load selected entries from a hard-coded JSON resource file.

    The file is read with ``MU.read_json`` and is expected to contain an
    ``"items"`` list whose elements each carry a ``"key"`` and a ``"values"``
    entry.

    Args:
        file_path: Path of the JSON file to read.
        wanted: Optional collection of keys to keep. When it is ``None`` or
            empty, every item in the file is returned.

    Returns:
        A dict mapping each selected key to its ``"values"``. When the file
        has no ``"items"`` entry an empty dict is returned, so callers always
        receive a dict (previously this path returned ``None`` implicitly).

    Raises:
        KeyError: If an item lacks ``"key"`` or ``"values"``.
    """
    data = MU.read_json(file_path)
    if "items" not in data:
        return {}
    # Later duplicates of a key overwrite earlier ones, as with plain assignment.
    return {
        item["key"]: item["values"]
        for item in data["items"]
        if not wanted or item["key"] in wanted
    }

# Hard-coded lookup data consumed by the extractor below; only the listed keys
# are kept from each file.
exceptData = loadHardcodes(exceptPath, wanted=["common_words", "proper_names", "abbreviations"])
markerData = loadHardcodes(markerPath, wanted=["keywords", "markers"])
statusData = loadHardcodes(statusPath, wanted=["brackets", "sentence_ends"])

# Load the two encoders at import time. Each call receives the model name and
# its cache path and yields an (encoder, device) pair.
# `chunker` / `chunksDevice` are not used in the visible part of this file.
Loader = ML.ModelLoader()
indexer, embeddDevice = Loader.load_encoder(EMBEDD_MODEL, EMBEDD_CACHED_MODEL)
chunker, chunksDevice = Loader.load_encoder(CHUNKS_MODEL, CHUNKS_CACHED_MODEL)

# Shared pipeline components, created once at import time and used by the
# *Run helpers below.

# PDF extractor, fed with the three hard-coded data sets loaded above.
dataExtractor = ExtractData.B1Extractor(
    exceptData,
    markerData,
    statusData,
    proper_name_min_count=10
)

# Structure analyzer used by structRun().
structAnalyzer = GetStructures.StructureAnalyzer(
    verbose=True
)

# Chunk builder used by chunkRun().
chunkBuilder = ChunkMaster.ChunkBuilder()

# Schema extractor used by schemaRun().
schemaExt = SchemaExt.JSONSchemaExtractor(
    list_policy="first", 
    verbose=True
)

# Indexer used by Indexing(); it wraps the embedding encoder loaded above and
# is told which device that encoder lives on (passed as a string).
# NOTE(review): the meaning of the remaining options is defined by
# DirectFaissIndexer and is not visible here - check that class before tuning.
faissIndexer = F_Embedding.DirectFaissIndexer(
    indexer=indexer,
    device=str(embeddDevice),
    batch_size=32,
    show_progress=True,
    flatten_mode="split",
    join_sep="\n",
    allowed_schema_types=("string", "array", "dict"),
    max_chars_per_text=2000,
    normalize=True,
    verbose=False
)

def extractRun(pdf_doc):
    """Run the extractor on ``pdf_doc`` and merge its lines into paragraphs.

    The output of ``dataExtractor.extract`` is passed straight through
    ``MergeData.mergeLinesToParagraphs`` and that result is returned.
    """
    return MergeData.mergeLinesToParagraphs(dataExtractor.extract(pdf_doc))

def structRun(RawDataDict):
    """Run the structure-analysis stages in order and return the final result.

    Stages, all on ``structAnalyzer``: extract_markers -> build_structures ->
    deduplicate -> select_top -> extend_top. The result is also printed as
    pretty JSON before being returned.
    """
    analyzer = structAnalyzer
    found_markers = analyzer.extract_markers(RawDataDict)
    built = analyzer.build_structures(found_markers)
    unique = analyzer.deduplicate(built)
    best = analyzer.select_top(unique)
    # extend_top needs both the selected entry and the full deduplicated set.
    levels = analyzer.extend_top(best, unique)
    print(MU.json_convert(levels, pretty=True))
    return levels

def chunkRun(RawLvlsDict=None, RawDataDict=None):
    """Return what ``chunkBuilder.build`` produces for the levels and raw data."""
    return chunkBuilder.build(RawLvlsDict, RawDataDict)

def _segmentHeading(value):
    """Return the cleaned heading text for a top-level value, or "" if none.

    A list is filtered to its string entries, dropping the literal placeholder
    "none" (case-insensitive), and joined with spaces. A plain string is
    treated as a one-element list. Any other type yields "".
    """
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, list):
        return ""
    joined = " ".join(
        v.strip() for v in value
        if isinstance(v, str) and v.strip().lower() != "none"
    )
    return joined.strip()


def SegmentRun(StructsDict, RawLvlsDict):
    """Keep the chunks that carry a usable top-level heading and number them.

    The top-level key is the first key of ``RawLvlsDict[0]``. A chunk is kept
    when its value under that key contains real text after placeholder "none"
    entries and blanks are removed.

    Previously the blank-text check was nested inside the list branch, so a
    chunk whose top-level value was a plain string was always dropped. Such
    strings are now checked the same way as list values.

    Args:
        StructsDict: Iterable of chunk dicts.
        RawLvlsDict: Non-empty sequence whose first element is a dict; its
            first key names the top level.

    Returns:
        List of the kept chunk dicts, in input order. Each one is updated in
        place with a 1-based ``"Index"`` entry.

    Raises:
        IndexError: If ``RawLvlsDict`` or its first dict is empty.
    """
    first_key = list(RawLvlsDict[0])[0]

    SegmentDict = [
        item for item in StructsDict
        if _segmentHeading(item.get(first_key))
    ]

    for i, item in enumerate(SegmentDict, start=1):
        item["Index"] = i

    return SegmentDict

def schemaRun(SegmentDict):
    """Extract the schema for ``SegmentDict``, print it and return it."""
    extracted = schemaExt.schemaRun(SegmentDict=SegmentDict)
    print(extracted)
    return extracted

def Indexing(SchemaDict):
    """Build the index via ``faissIndexer.build_from_json``.

    The segment, index and mapping locations come from the module-level path
    settings; only the schema is supplied by the caller.

    Returns:
        The ``(Mapping, MapData)`` pair produced by the indexer.
    """
    build_args = {
        "SegmentPath": SegmentPath,
        "SchemaDict": SchemaDict,
        "FaissPath": FaissPath,
        "MapDataPath": MapDataPath,
        "MappingPath": MappingPath,
        "MapChunkPath": MapChunkPath,
    }
    mapping, map_data = faissIndexer.build_from_json(**build_args)
    return mapping, map_data

# Entry mode read by Prepare(): "pdf" rebuilds every artefact from PdfPath;
# any other value (here "json") reuses the segments already stored at SegmentPath.
mode = "json"

def _segmentsFromPdf():
    """Build the segment list from the PDF at ``PdfPath``.

    Writes the raw data, levels, structures and segments to their configured
    paths along the way. Returns ``None`` when the quality check rejects the
    file. The document handle is closed on every exit path, including the
    early return and any exception raised while the document is open.
    """
    print("\nLoading File...")
    pdf_doc = fitz.open(PdfPath)
    try:
        checker = QualityCheck.PDFQualityChecker()
        is_good, info = checker.evaluate(pdf_doc)
        print(info["status"])
        if not is_good:
            print("⚠️ Bỏ qua file này.")
            return None
        print("✅ Tiếp tục xử lý.")

        print("\nExtracting...")
        RawDataDict = extractRun(pdf_doc)
        MU.write_json(RawDataDict, RawDataPath, indent=1)
    finally:
        # Previously the handle leaked on a failed quality check or an error.
        pdf_doc.close()

    print("\nGetting Struct...")
    RawLvlsDict = structRun(RawDataDict)
    MU.write_json(RawLvlsDict, RawLvlsPath, indent=2)

    print("\nChunking...")
    StructsDict = chunkRun(RawLvlsDict, RawDataDict)
    MU.write_json(StructsDict, StructsPath, indent=2)

    print("\nSegmenting...")
    SegmentDict = SegmentRun(StructsDict, RawLvlsDict)
    MU.write_json(SegmentDict, SegmentPath, indent=2)
    return SegmentDict


def Prepare():
    """Run the preparation pipeline and return its artefacts.

    When the module-level ``mode`` is ``"pdf"`` the segments are rebuilt from
    the PDF; otherwise they are read from ``SegmentPath``. In both cases the
    schema is then extracted and the index built, and the schema, mapping and
    map data are written to their configured paths.

    Returns:
        ``(SegmentDict, SchemaDict, Mapping, MapData)``, or four ``None``
        values when the PDF fails the quality check.
    """
    if mode == "pdf":
        SegmentDict = _segmentsFromPdf()
        if SegmentDict is None:
            return None, None, None, None
    else:
        SegmentDict = MU.read_json(SegmentPath)

    print("\nCreating Schema...")
    SchemaDict = schemaRun(SegmentDict)
    MU.write_json(SchemaDict, SchemaPath, indent=2)

    print("\nEmbedding...")
    Mapping, MapData = Indexing(SchemaDict)
    MU.write_json(Mapping, MappingPath, indent=2)
    MU.write_json(MapData, MapDataPath, indent=2)

    print("\nCompleted!")

    return SegmentDict, SchemaDict, Mapping, MapData

# Run the pipeline at import time and expose its results as module globals.
# All four names are None when Prepare() bails out on a failed PDF quality check.
SegmentDict, SchemaDict, Mapping, MapData = Prepare()