Spaces:

Orias171
/

doc-ai-api

Sleeping

App Files Files Community

LongK171 committed on Oct 21

Commit

dbe2c62

1 Parent(s): 7a5cceb

Add all

Browse files

Files changed (43) hide show

.gitignore +21 -0
.vscode/settings.json +5 -0
AppGenerator.py +207 -0
App_Caller.py +194 -0
App_Run.py +34 -0
Assets/ex.exceptions.json +67 -0
Assets/ex.markers.json +122 -0
Assets/ex.status.json +20 -0
Config/APIs.json +11 -0
Config/Config.json +0 -0
Config/Configs.py +89 -0
Config/ModelLoader.py +280 -0
Database/Categories/Categories_Chunks_Schema.json +8 -0
Database/Categories/Categories_Chunks_Segment.json +313 -0
Database/Categories/Categories_Embedding_Index.faiss +3 -0
Database/Categories/Categories_Embedding_MapChunk.json +175 -0
Database/Categories/Categories_Embedding_MapData.json +845 -0
Database/Categories/Categories_Embedding_Mapping.json +177 -0
Demo/Assets/script.js +167 -0
Demo/Assets/style.css +279 -0
Demo/index.html +63 -0
Dockerfile +39 -0
Environment/bruh.yml +108 -0
Environment/env.yml +83 -0
Libraries/Common_MyUtils.py +273 -0
Libraries/Common_PdfProcess.py +152 -0
Libraries/Common_TextProcess.py +125 -0
Libraries/Faiss_ChunkMapping.py +184 -0
Libraries/Faiss_Embedding.py +288 -0
Libraries/Faiss_Searching.py +147 -0
Libraries/Json_ChunkMaster.py +91 -0
Libraries/Json_ChunkUnder.py +141 -0
Libraries/Json_GetStructures.py +223 -0
Libraries/Json_SchemaExt.py +155 -0
Libraries/PDF_ExtractData.py +605 -0
Libraries/PDF_MergeData.py +283 -0
Libraries/PDF_QualityCheck.py +106 -0
Libraries/Summarizer_Runner.py +162 -0
Libraries/Summarizer_Trainer.py +223 -0
README.md +27 -10
app.py +144 -0
requirements.txt +33 -0
start.sh +6 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,21 @@

+# Python
+__pycache__/
+*.py[cod]
+*.so
+*.pyd
+*.egg-info/
+.venv/
+venv/
+.env
+# Cache / data
+.cache/
+**/Models/
+**/outputs/
+**/checkpoints/
+# OS junk
+.DS_Store
+# Hugging Face
+*.bak

.vscode/settings.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+    "python-envs.defaultEnvManager": "ms-python.python:conda",
+    "python-envs.defaultPackageManager": "ms-python.python:conda",
+    "python-envs.pythonProjects": []
+}

AppGenerator.py ADDED Viewed

	@@ -0,0 +1,207 @@

+import fitz
+from Config import Configs
+from Config import ModelLoader as ML
+from Libraries import Common_MyUtils as MU
+from Libraries import PDF_ExtractData as ExtractData, PDF_MergeData as MergeData, PDF_QualityCheck as QualityCheck
+from Libraries import Json_GetStructures as GetStructures, Json_ChunkMaster as ChunkMaster, Json_SchemaExt as SchemaExt
+from Libraries import Faiss_Embedding as F_Embedding
+# Run settings for this index-building script: which service to build and which input file name to resolve.
+Checkpoint = "vinai/bartpho-syllable"
+service = "Categories"
+inputs = "Categories.json"
+JsonKey = "paragraphs"
+JsonField = "Text"
+# Resolve every path / key / model setting for the service once, then unpack the dictionary into module-level names.
+config = Configs.ConfigValues(service=service, inputs=inputs)
+inputPath = config["inputPath"]
+PdfPath = config["PdfPath"]
+DocPath = config["DocPath"]
+exceptPath = config["exceptPath"]
+markerPath = config["markerPath"]
+statusPath = config["statusPath"]
+RawDataPath = config["RawDataPath"]
+RawLvlsPath = config["RawLvlsPath"]
+StructsPath = config["StructsPath"]
+SegmentPath = config["SegmentPath"]
+SchemaPath = config["SchemaPath"]
+FaissPath = config["FaissPath"]
+MappingPath = config["MappingPath"]
+MapDataPath = config["MapDataPath"]
+MapChunkPath = config["MapChunkPath"]
+MetaPath = config["MetaPath"]
+DATA_KEY = config["DATA_KEY"]
+EMBE_KEY = config["EMBE_KEY"]
+SEARCH_EGINE = config["SEARCH_EGINE"]
+RERANK_MODEL = config["RERANK_MODEL"]
+RESPON_MODEL = config["RESPON_MODEL"]
+EMBEDD_MODEL = config["EMBEDD_MODEL"]
+CHUNKS_MODEL = config["CHUNKS_MODEL"]
+SUMARY_MODEL = config["SUMARY_MODEL"]
+WORD_LIMIT = config["WORD_LIMIT"]
+# Local model cache layout: Models/<model family>/<model name>.
+MODEL_DIR = "Models"
+MODEL_ENCODE = "Sentence_Transformer"
+MODEL_SUMARY = "Summarizer"
+EMBEDD_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{EMBEDD_MODEL}"
+CHUNKS_CACHED_MODEL = F"{MODEL_DIR}/{MODEL_ENCODE}/{CHUNKS_MODEL}"
+SUMARY_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_SUMARY}/{SUMARY_MODEL}"
+# NOTE(review): the values below look like summarizer length limits and training hyper-parameters;
+# nothing in the visible part of this script reads them — confirm they are still needed here.
+MAX_INPUT = 1024
+MAX_TARGET = 256
+MIN_TARGET = 64
+TRAIN_EPOCHS = 3
+LEARNING_RATE = 3e-5
+WEIGHT_DECAY = 0.01
+BATCH_SIZE = 4
+def loadHardcodes(file_path, wanted=None):
+    """Read a hard-code JSON file and map each selected item's key to its values.
+
+    Args:
+        file_path: Path of a JSON document read through ``MU.read_json``.
+        wanted: Optional collection of item keys to keep; a falsy value keeps every item.
+
+    Returns:
+        A dict ``{item["key"]: item["values"]}`` for the kept items, or ``None``
+        when the document has no top-level ``"items"`` entry.
+    """
+    payload = MU.read_json(file_path)
+    if "items" not in payload:
+        return None
+    return {
+        entry["key"]: entry["values"]
+        for entry in payload["items"]
+        if not wanted or entry["key"] in wanted
+    }
+# Hard-coded resources handed to the PDF extractor: exception word lists, marker patterns and punctuation rules.
+exceptData = loadHardcodes(exceptPath, wanted=["common_words", "proper_names", "abbreviations"])
+markerData = loadHardcodes(markerPath, wanted=["keywords", "markers"])
+statusData = loadHardcodes(statusPath, wanted=["brackets", "sentence_ends"])
+# Load the sentence-embedding models at import time, using the local cache directories defined above.
+Loader = ML.ModelLoader()
+indexer, embeddDevice = Loader.load_encoder(EMBEDD_MODEL, EMBEDD_CACHED_MODEL)
+# NOTE(review): the chunker is loaded through load_encoder() rather than load_chunker(), so the
+# loader's "encoder" registry entry ends up pointing at the chunker model — confirm this is intended.
+chunker, chunksDevice = Loader.load_encoder(CHUNKS_MODEL, CHUNKS_CACHED_MODEL)
+# Pipeline stage objects, built once and reused by the *Run helpers below.
+dataExtractor = ExtractData.B1Extractor(
+    exceptData,
+    markerData,
+    statusData,
+    proper_name_min_count=10
+)
+structAnalyzer = GetStructures.StructureAnalyzer(
+    verbose=True
+)
+chunkBuilder = ChunkMaster.ChunkBuilder()
+schemaExt = SchemaExt.JSONSchemaExtractor(
+    list_policy="first",
+    verbose=True
+)
+# FAISS indexer driven by the embedding model loaded above; it runs on the same device as that model.
+faissIndexer = F_Embedding.DirectFaissIndexer(
+    indexer=indexer,
+    device=str(embeddDevice),
+    batch_size=32,
+    show_progress=True,
+    flatten_mode="split",
+    join_sep="\n",
+    allowed_schema_types=("string", "array", "dict"),
+    max_chars_per_text=2000,
+    normalize=True,
+    verbose=False
+)
+def extractRun(pdf_doc):
+    """Extract line data from an open PDF document and merge the lines into paragraphs."""
+    return MergeData.mergeLinesToParagraphs(dataExtractor.extract(pdf_doc))
+def structRun(RawDataDict):
+    """Derive the level structure of the extracted data, print it as pretty JSON and return it.
+
+    Runs the analyzer stages in order: extract markers, build structures,
+    deduplicate, select the top structure, then extend it with the deduplicated set.
+    """
+    found_markers = structAnalyzer.extract_markers(RawDataDict)
+    unique_structs = structAnalyzer.deduplicate(structAnalyzer.build_structures(found_markers))
+    best = structAnalyzer.select_top(unique_structs)
+    levels = structAnalyzer.extend_top(best, unique_structs)
+    print(MU.json_convert(levels, pretty=True))
+    return levels
+def chunkRun(RawLvlsDict=None, RawDataDict=None):
+    """Build the structured chunks from the level description and the raw extracted data."""
+    return chunkBuilder.build(RawLvlsDict, RawDataDict)
+def SegmentRun(StructsDict, RawLvlsDict):
+    """Keep the chunks whose leading field carries real text and number them from 1.
+
+    The leading field is the first key of the first entry in ``RawLvlsDict``.
+    A chunk is kept when that field, after dropping blank and ``"none"``
+    (case-insensitive) entries, still contains text.
+
+    Args:
+        StructsDict: Iterable of chunk dicts.
+        RawLvlsDict: Non-empty sequence of dicts; only the first key of its
+            first element is used.
+
+    Returns:
+        The kept chunk dicts (the same objects, mutated in place with an
+        ``"Index"`` entry counting from 1).
+
+    Raises:
+        IndexError: If ``RawLvlsDict`` or its first element is empty.
+    """
+    first_key = list(RawLvlsDict[0])[0]
+    SegmentDict = []
+    for item in StructsDict:
+        value = item.get(first_key)
+        if not value:
+            continue
+        # Fix: a plain string used to fall through without ever being appended, because the
+        # append sat inside the list-only branch. Treat it as a one-element list instead.
+        parts = [value] if isinstance(value, str) else value
+        if not isinstance(parts, list):
+            # Neither text nor a list of text: nothing to segment on.
+            continue
+        text = " ".join(
+            p.strip() for p in parts
+            if isinstance(p, str) and p.strip().lower() != "none"
+        )
+        if text.strip():
+            SegmentDict.append(item)
+    for i, item in enumerate(SegmentDict, start=1):
+        item["Index"] = i
+    return SegmentDict
+def schemaRun(SegmentDict):
+    """Extract the schema of the segmented chunks, print it and return it."""
+    schema = schemaExt.schemaRun(SegmentDict=SegmentDict)
+    print(schema)
+    return schema
+def Indexing(SchemaDict):
+    """Build the FAISS index from the segment file and return ``(Mapping, MapData)``."""
+    artifact_paths = {
+        "SegmentPath": SegmentPath,
+        "FaissPath": FaissPath,
+        "MapDataPath": MapDataPath,
+        "MappingPath": MappingPath,
+        "MapChunkPath": MapChunkPath,
+    }
+    mapping, map_data = faissIndexer.build_from_json(SchemaDict=SchemaDict, **artifact_paths)
+    return mapping, map_data
+# "pdf" rebuilds everything from the source PDF; any other value reuses the existing segment file.
+mode = "json"
+def Prepare():
+    """Build the schema and FAISS index for the configured service.
+
+    In ``"pdf"`` mode the source PDF is quality-checked, extracted, structured,
+    chunked and segmented first, and each intermediate result is written to its
+    configured path. In any other mode the existing segment file is loaded.
+
+    Returns:
+        ``(SegmentDict, SchemaDict, Mapping, MapData)``, or four ``None`` values
+        when the PDF fails the quality check.
+    """
+    if mode == "pdf":
+        print("\nLoading File...")
+        pdf_doc = fitz.open(PdfPath)
+        # try/finally: the document used to stay open on the early "bad quality"
+        # return and whenever checking or extraction raised.
+        try:
+            checker = QualityCheck.PDFQualityChecker()
+            is_good, info = checker.evaluate(pdf_doc)
+            print(info["status"])
+            if not is_good:
+                print("⚠️ Bỏ qua file này.")
+                return None, None, None, None
+            print("✅ Tiếp tục xử lý.")
+            print("\nExtracting...")
+            RawDataDict = extractRun(pdf_doc)
+        finally:
+            pdf_doc.close()
+        MU.write_json(RawDataDict, RawDataPath, indent=1)
+        print("\nGetting Struct...")
+        RawLvlsDict = structRun(RawDataDict)
+        MU.write_json(RawLvlsDict, RawLvlsPath, indent=2)
+        print("\nChunking...")
+        StructsDict = chunkRun(RawLvlsDict, RawDataDict)
+        MU.write_json(StructsDict, StructsPath, indent=2)
+        print("\nSegmenting...")
+        SegmentDict = SegmentRun(StructsDict, RawLvlsDict)
+        MU.write_json(SegmentDict, SegmentPath, indent=2)
+    else:
+        SegmentDict = MU.read_json(SegmentPath)
+    print("\nCreating Schema...")
+    SchemaDict = schemaRun(SegmentDict)
+    MU.write_json(SchemaDict, SchemaPath, indent=2)
+    print("\nEmbedding...")
+    Mapping, MapData = Indexing(SchemaDict)
+    MU.write_json(Mapping, MappingPath, indent=2)
+    MU.write_json(MapData, MapDataPath, indent=2)
+    print("\nCompleted!")
+    return SegmentDict, SchemaDict, Mapping, MapData
+# The whole pipeline runs as a side effect of executing (or importing) this module.
+SegmentDict, SchemaDict, Mapping, MapData = Prepare()

App_Caller.py ADDED Viewed

	@@ -0,0 +1,194 @@

+import faiss
+import fitz
+from sentence_transformers import CrossEncoder
+from Config import Configs
+from Config import ModelLoader as ML
+from Libraries import Common_MyUtils as MU, Common_TextProcess as TP
+from Libraries import PDF_ExtractData as ExtractData, PDF_MergeData as MergeData, PDF_QualityCheck as QualityCheck
+from Libraries import Json_ChunkUnder as ChunkUnder
+from Libraries import Faiss_Searching as F_Searching, Faiss_ChunkMapping as ChunkMapper
+from Libraries import Summarizer_Runner as SummaryRun
+# Run settings for the request-serving module: service whose prebuilt index is used, and a default input name.
+Checkpoint = "vinai/bartpho-syllable"
+service = "Categories"
+inputs = "BAD.pdf"
+# Key and field passed to TP.merge_txt when flattening the extracted data into one text.
+JsonKey = "paragraphs"
+JsonField = "Text"
+# Resolve every path / key / model setting for the service once, then unpack the dictionary into module-level names.
+config = Configs.ConfigValues(service=service, inputs=inputs)
+inputPath = config["inputPath"]
+PdfPath = config["PdfPath"]
+DocPath = config["DocPath"]
+exceptPath = config["exceptPath"]
+markerPath = config["markerPath"]
+statusPath = config["statusPath"]
+RawDataPath = config["RawDataPath"]
+RawLvlsPath = config["RawLvlsPath"]
+StructsPath = config["StructsPath"]
+SegmentPath = config["SegmentPath"]
+SchemaPath = config["SchemaPath"]
+FaissPath = config["FaissPath"]
+MappingPath = config["MappingPath"]
+MapDataPath = config["MapDataPath"]
+MapChunkPath = config["MapChunkPath"]
+MetaPath = config["MetaPath"]
+DATA_KEY = config["DATA_KEY"]
+EMBE_KEY = config["EMBE_KEY"]
+SEARCH_EGINE = config["SEARCH_EGINE"]
+RERANK_MODEL = config["RERANK_MODEL"]
+RESPON_MODEL = config["RESPON_MODEL"]
+EMBEDD_MODEL = config["EMBEDD_MODEL"]
+CHUNKS_MODEL = config["CHUNKS_MODEL"]
+SUMARY_MODEL = config["SUMARY_MODEL"]
+WORD_LIMIT = config["WORD_LIMIT"]
+# Local model cache layout: Models/<model family>/<model name>.
+MODEL_DIR = "Models"
+MODEL_ENCODE = "Sentence_Transformer"
+MODEL_SUMARY = "Summarizer"
+EMBEDD_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_ENCODE}/{EMBEDD_MODEL}"
+CHUNKS_CACHED_MODEL = F"{MODEL_DIR}/{MODEL_ENCODE}/{CHUNKS_MODEL}"
+SUMARY_CACHED_MODEL = f"{MODEL_DIR}/{MODEL_SUMARY}/{SUMARY_MODEL}"
+# NOTE(review): the values below look like summarizer length limits and training hyper-parameters;
+# nothing in the visible part of this module reads them — confirm they are still needed here.
+MAX_INPUT = 1024
+MAX_TARGET = 256
+MIN_TARGET = 64
+TRAIN_EPOCHS = 3
+LEARNING_RATE = 3e-5
+WEIGHT_DECAY = 0.01
+BATCH_SIZE = 4
+def loadHardcodes(file_path, wanted=None):
+    """Read a hard-code JSON file and map each selected item's key to its values.
+
+    Args:
+        file_path: Path of a JSON document read through ``MU.read_json``.
+        wanted: Optional collection of item keys to keep; a falsy value keeps every item.
+
+    Returns:
+        A dict ``{item["key"]: item["values"]}`` for the kept items, or ``None``
+        when the document has no top-level ``"items"`` entry.
+    """
+    payload = MU.read_json(file_path)
+    if "items" not in payload:
+        return None
+    return {
+        entry["key"]: entry["values"]
+        for entry in payload["items"]
+        if not wanted or entry["key"] in wanted
+    }
+# Hard-coded resources handed to the PDF extractor: exception word lists, marker patterns and punctuation rules.
+exceptData = loadHardcodes(exceptPath, wanted=["common_words", "proper_names", "abbreviations"])
+markerData = loadHardcodes(markerPath, wanted=["keywords", "markers"])
+statusData = loadHardcodes(statusPath, wanted=["brackets", "sentence_ends"])
+# Load every model at import time, using the local cache directories defined above.
+Loader = ML.ModelLoader()
+indexer, embeddDevice = Loader.load_encoder(EMBEDD_MODEL, EMBEDD_CACHED_MODEL)
+# NOTE(review): the chunker is loaded through load_encoder() rather than load_chunker(), and
+# `chunker` is not referenced again in the visible part of this module — confirm it is needed.
+chunker, chunksDevice = Loader.load_encoder(CHUNKS_MODEL, CHUNKS_CACHED_MODEL)
+tokenizer, summarizer, summaryDevice = Loader.load_summarizer(SUMARY_MODEL, SUMARY_CACHED_MODEL)
+def runPrepareData():
+    """Load the prebuilt search artifacts from disk.
+
+    Returns:
+        ``(SegmentDict, Mapping, MapData, MapChunk, faissIndex)``: the four JSON
+        documents read from their configured paths, followed by the FAISS index.
+    """
+    segments, mapping, map_data, map_chunk = (
+        MU.read_json(path)
+        for path in (SegmentPath, MappingPath, MapDataPath, MapChunkPath)
+    )
+    return segments, mapping, map_data, map_chunk, faiss.read_index(FaissPath)
+# Load the prebuilt index artifacts once at import time; runSearch() and fileProcess() read these globals.
+SegmentDict, Mapping, MapData, MapChunk, faissIndex = runPrepareData()
+# Extractor that turns an open PDF into raw line data (see extractRun below).
+dataExtractor = ExtractData.B1Extractor(
+    exceptData,
+    markerData,
+    statusData,
+    proper_name_min_count=10
+)
+# Chunk builder handed to the summarizer; note it embeds with `indexer`, not with `chunker`.
+chunkUnder = ChunkUnder.ChunkUndertheseaBuilder(
+    embedder=indexer,
+    device=embeddDevice,
+    min_words=256,
+    max_words=768,
+    sim_threshold=0.7,
+    key_sent_ratio=0.4
+)
+# Summarizer built on the loaded seq2seq model and tokenizer, with chunkUnder as its chunk builder.
+summarizer_engine = SummaryRun.RecursiveSummarizer(
+    tokenizer=tokenizer,
+    summarizer=summarizer,
+    sum_device=summaryDevice,
+    chunk_builder=chunkUnder,
+    max_length=200,
+    min_length=100,
+    max_depth=4
+)
+# Cross-encoder reranker, placed on the same device as the embedding model.
+reranker = CrossEncoder(RERANK_MODEL, device=str(embeddDevice))
+# Search engine combining the embedding model and the reranker.
+searchEngine = F_Searching.SemanticSearchEngine(
+    indexer=indexer,
+    reranker=reranker,
+    device=str(embeddDevice),
+    normalize=True,
+    top_k=20,
+    rerank_k=10,
+    rerank_batch_size=16
+)
+def extractRun(pdf_doc):
+    """Extract line data from an open PDF document and merge the lines into paragraphs."""
+    return MergeData.mergeLinesToParagraphs(dataExtractor.extract(pdf_doc))
+def runSearch(query):
+    """Search the preloaded FAISS index for ``query`` and return the engine's top-20 results."""
+    index_artifacts = {
+        "faissIndex": faissIndex,
+        "Mapping": Mapping,
+        "MapData": MapData,
+        "MapChunk": MapChunk,
+    }
+    return searchEngine.search(query=query, top_k=20, **index_artifacts)
+def runRerank(query, results):
+    """Rerank search ``results`` against ``query`` and return the engine's top-10 output."""
+    return searchEngine.rerank(query=query, results=results, top_k=10)
+def fileProcess(pdf_bytes):
+    """Run the main pipeline on a PDF supplied as bytes.
+
+    The document is quality-checked first. A rejected document yields the
+    checker's message as the summary and no category. An accepted document is
+    extracted, summarized, searched against the prebuilt index, reranked, and
+    mapped back to its best-matching ``"Article"`` field.
+
+    Args:
+        pdf_bytes: Raw content of the PDF file.
+
+    Returns:
+        A dict with keys ``"checkstatus"`` (0 or 1), ``"metrics"``,
+        ``"summary"``, ``"category"`` and ``"reranked"`` (at most five entries).
+    """
+    pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+    # try/finally: the document used to stay open whenever any pipeline stage raised.
+    try:
+        checker = QualityCheck.PDFQualityChecker()
+        is_good, metrics = checker.evaluate(pdf_doc)
+        print(metrics)
+        if not is_good:
+            print("⚠️ Bỏ qua file này.")
+            check_status = 0
+            summaryText = metrics["check_mess"]
+            bestArticle = ""
+            reranked = ""
+        else:
+            print("✅ Tiếp tục xử lý.")
+            # Fix: this was `1,` — the trailing comma made it the tuple (1,), so callers
+            # received [1] in the JSON response instead of 1.
+            check_status = 1
+            RawDataDict = extractRun(pdf_doc)
+            full_text = TP.merge_txt(RawDataDict, JsonKey, JsonField)
+            summarized = summarizer_engine.summarize(full_text, minInput = 256, maxInput = 1024)
+            summaryText = summarized["summary_text"]
+            results = runSearch(summaryText)
+            reranked = runRerank(summaryText, results)
+            chunkReturn = ChunkMapper.process_chunks_pipeline(
+                reranked_results=reranked,
+                SegmentDict=SegmentDict,
+                drop_fields=["Index"],
+                fields=["Article"],
+                n_chunks=1,
+            )
+            # Skip chunks without an "Article" value so the join below cannot hit None.
+            bestArticles = [
+                article
+                for article in (item["fields"].get("Article") for item in chunkReturn["extracted_fields"])
+                if article is not None
+            ]
+            bestArticle = bestArticles[0] if len(bestArticles) == 1 else ", ".join(map(str, bestArticles))
+    finally:
+        pdf_doc.close()
+    return {
+        "checkstatus": check_status,
+        "metrics": metrics,
+        "summary": summaryText,
+        "category": bestArticle,
+        "reranked": reranked[:5] if reranked else []
+    }

App_Run.py ADDED Viewed

	@@ -0,0 +1,34 @@

+from flask import Flask, request, jsonify
+from flask_cors import CORS
+import App_Caller
+# Flask application object for the PDF-processing API.
+app = Flask(__name__)
+# CORS is enabled with the extension's default options (no origin list is passed).
+# NOTE(review): confirm that accepting cross-origin requests this broadly is intended.
+CORS(app)
+@app.route("/process_pdf", methods=["POST"])
+def process_pdf():
+    """Accept an uploaded PDF and return its summary and category.
+
+    Expects a multipart form field named ``"file"``. Responds with HTTP 400
+    when the field is missing or the file name does not end in ``.pdf``, HTTP
+    500 with the error message when processing fails, and otherwise a JSON
+    document with the quality status, metrics, summary, category and top
+    reranked candidates.
+    """
+    if "file" not in request.files:
+        return jsonify({"error": "Thiếu file PDF"}), 400
+    pdf_file = request.files["file"]
+    # The file name can be missing or empty, and extensions such as ".PDF" are valid too;
+    # the old check raised AttributeError on None and rejected upper-case extensions.
+    filename = pdf_file.filename or ""
+    if not filename.lower().endswith(".pdf"):
+        return jsonify({"error": "File không hợp lệ"}), 400
+    try:
+        pdf_bytes = pdf_file.read()
+        result = App_Caller.fileProcess(pdf_bytes)
+        return jsonify({
+            "status": "success",
+            "checkstatus": result["checkstatus"],
+            "metrics": result["metrics"],
+            "summary": result["summary"],
+            "category": result["category"],
+            "top_candidates": result["reranked"]
+        })
+    except Exception as e:
+        # Top-level boundary: report any pipeline failure to the client as HTTP 500.
+        return jsonify({"status": "error", "message": str(e)}), 500
+# Start Flask's built-in server on all interfaces, port 8000, when run as a script.
+if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=8000)

Assets/ex.exceptions.json ADDED Viewed

	@@ -0,0 +1,67 @@

+{
+  "type": "exceptions",
+  "items": [
+    {
+      "key": "common_words",
+      "values": [
+        "a",
+        "an",
+        "the",
+        "and",
+        "but",
+        "or",
+        "nor",
+        "for",
+        "so",
+        "yet",
+        "at",
+        "by",
+        "in",
+        "of",
+        "on",
+        "to",
+        "from",
+        "with",
+        "as",
+        "into",
+        "like",
+        "over",
+        "under",
+        "up",
+        "down",
+        "out",
+        "upon",
+        "onto",
+        "amid",
+        "among",
+        "between",
+        "before",
+        "after",
+        "against"
+      ]
+    },
+    {
+      "key": "proper_names",
+      "values": [
+        { "text": "HCM", "case_style": "upper" },
+        { "text": "ASEAN", "case_style": "upper" },
+        { "text": "UNESCO", "case_style": "upper" }
+      ]
+    },
+    {
+      "key": "abbreviations",
+      "values": [
+        { "text": "VN", "case_style": "upper" },
+        { "text": "TP.HCM", "case_style": "title" },
+        { "text": "ĐH", "case_style": "upper" },
+        { "text": "THPT", "case_style": "upper" },
+        { "text": "UBND", "case_style": "upper" },
+        { "text": "KT-XH", "case_style": "upper" },
+        { "text": "BĐS", "case_style": "upper" },
+        { "text": "QH", "case_style": "upper" },
+        { "text": "NN", "case_style": "upper" },
+        { "text": "XD", "case_style": "upper" }
+      ]
+    }
+  ]
+}

Assets/ex.markers.json ADDED Viewed

	@@ -0,0 +1,122 @@

+{
+  "type": "markers",
+  "items": [
+    {
+      "key": "keywords",
+      "values": [
+        "điều",
+        "khoản",
+        "mục",
+        "điểm",
+        "tiểu mục",
+        "chương",
+        "phần",
+        "chapter",
+        "section",
+        "article",
+        "part",
+        "clause",
+        "paragraph",
+        "item",
+        "point"
+      ]
+    },
+    {
+      "key": "markers",
+      "values": [
+        {
+          "pattern": "^([-+*•●◦○])\\s+",
+          "description": "Dấu đầu dòng (ví dụ: -, *, •, v.v.) theo sau bởi khoảng trắng",
+          "type": ""
+        },
+        {
+          "pattern": "^(?:{keywords})([0-9]+|[IVX]+)([.\\)\\]:,;-])\\s+",
+          "description": "Từ khóa dính liền số hoặc số La Mã, theo sau bởi khoảng trắng",
+          "type": ""
+        },
+        {
+          "pattern": "^(?:{keywords})([0-9]+|[IVX]+)([.\\)\\]:,;-]?)$",
+          "description": "Từ khóa dính liền số hoặc số La Mã, không có ký tự nào sau marker",
+          "type": ""
+        },
+        {
+          "pattern": "^(?:{keywords})\\s+([0-9]+|[a-z]|[A-Z]|[IVX]+)([.\\)\\]:,;-])\\s+",
+          "description": "Từ khóa theo sau bởi số, chữ cái (1 ký tự), hoặc số La Mã, với dấu kết thúc, sau đó là khoảng trắng",
+          "type": ""
+        },
+        {
+          "pattern": "^(?:{keywords})\\s+([0-9]+|[a-z]|[A-Z]|[IVX]+)([.\\)\\]:,;-]?)$",
+          "description": "Từ khóa theo sau bởi số, chữ cái (1 ký tự), hoặc số La Mã, với dấu kết thúc tùy chọn, không có ký tự nào sau marker",
+          "type": ""
+        },
+        {
+          "pattern": "^(?:{keywords})\\s+([0-9]+(?:\\.[0-9]+)*|[a-z](?:\\.[a-z])*|[A-Z](?:\\.[A-Z])*|[IVX]+(?:\\.[IVX]+)*)([.\\)\\]:,;-])\\s+",
+          "description": "Từ khóa theo sau bởi số, chữ cái (1 ký tự), hoặc số La Mã dạng phân cấp, với dấu kết thúc, sau đó là khoảng trắng",
+          "type": ""
+        },
+        {
+          "pattern": "^(?:{keywords})\\s+([0-9]+(?:\\.[0-9]+)*|[a-z](?:\\.[a-z])*|[A-Z](?:\\.[A-Z])*|[IVX]+(?:\\.[IVX]+)*)([.\\)\\]:,;-]?)$",
+          "description": "Từ khóa theo sau bởi số, chữ cái (1 ký tự), hoặc số La Mã dạng phân cấp, với dấu kết thúc tùy chọn, không có ký tự nào sau marker",
+          "type": ""
+        },
+        {
+          "pattern": "^([0-9]+|[a-z]|[A-Z])([.\\)\\]:,;-])\\s+",
+          "description": "Số hoặc chữ cái (1 ký tự) với dấu kết thúc, theo sau bởi khoảng trắng",
+          "type": ""
+        },
+        {
+          "pattern": "^(?:\\(([0-9]+|[a-z]|[A-Z]|[IVX]+)\\)|\"([0-9]+)\"|'([a-z])'|\\{([IVX]+)\\})\\s+",
+          "description": "Nhóm 'trong ngoặc' (tròn: số/chữ/La Mã; kép: số; đơn: chữ thường; nhọn: La Mã), theo sau bởi khoảng trắng",
+          "type": ""
+        },
+        {
+          "pattern": "^([0-9]+(?:\\.[0-9]+)*|[a-z](?:\\.[a-z])*|[A-Z](?:\\.[A-Z])*|[IVX]+(?:\\.[IVX]+)*)([.\\)\\]:,;-])\\s+",
+          "description": "Số/chữ/La Mã dạng phân cấp có dấu kết thúc, theo sau bởi khoảng trắng",
+          "type": ""
+        },
+        {
+          "pattern": "^([0-9]+)\\s*-\\s*([0-9]+)\\s+",
+          "description": "Khoảng số (ví dụ: 1 - 2), theo sau bởi khoảng trắng",
+          "type": ""
+        },
+        {
+          "pattern": "^([A-Z]\\+|[IVX]+\\+|[0-9]+\\+)([.\\)\\]:,;-])\\s+",
+          "description": "Chữ hoa, La Mã hoặc số kèm dấu +, bắt buộc có dấu kết thúc, theo sau bởi khoảng trắng",
+          "type": ""
+        },
+        {
+          "pattern": "^([đêôơưĐÊÔƠƯ])([.\\)\\]:,;-])\\s+",
+          "description": " == CHỮ CÁI TIẾNG VIỆT == với dấu kết thúc, theo sau bởi khoảng trắng",
+          "type": ""
+        },
+        {
+          "pattern": "^\\(([đêôơưĐÊÔƠƯ])\\)\\s+",
+          "description": " == CHỮ CÁI TIẾNG VIỆT == trong ngoặc tròn, theo sau bởi khoảng trắng",
+          "type": ""
+        },
+        {
+          "pattern": "^\"([đêôơưĐÊÔƠƯ])\"\\s+",
+          "description": " == CHỮ CÁI TIẾNG VIỆT == trong ngoặc kép, theo sau bởi khoảng trắng",
+          "type": ""
+        },
+        {
+          "pattern": "^'([đêôơưĐÊÔƠƯ])'\\s+",
+          "description": " == CHỮ CÁI TIẾNG VIỆT == trong ngoặc đơn, theo sau bởi khoảng trắng",
+          "type": ""
+        },
+        {
+          "pattern": "^\\{([đêôơưĐÊÔƠƯ])\\}\\s+",
+          "description": " == CHỮ CÁI TIẾNG VIỆT == trong ngoặc nhọn, theo sau bởi khoảng trắng",
+          "type": ""
+        }
+      ]
+    },
+    {
+      "key": "notMakers",
+      "values": []
+    }
+  ]
+}

Assets/ex.status.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "type": "status",
+  "items": [
+    {
+      "key": "brackets",
+      "values": {
+        "open": "[\\(\\[\\{«“‘]",
+        "close": "[\\)\\]\\}»”’]",
+        "pairs": ["()", "[]", "{}", "«»", "“”", "‘’"]
+      }
+    },
+    {
+      "key": "sentence_ends",
+      "values": {
+        "punctuation": "[.!?:;]",
+        "valid_brackets": ["()", "[]", "{}"]
+      }
+    }
+  ]
+}

Config/APIs.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "APIs": [
+    "AIzaSyDaHS-8h6GJkyVPhoX4svvYeBTTVLNO-2w",
+    "AIzaSyD81vpriaNcvCyGOxy3TRR0w_njxgPJYfE",
+    "AIzaSyCsQo1gnYSLELV9flyPkYgHBdEvz7lqPjk",
+    "AIzaSyAJ7QFBJtozfyooguHAqsJsLO0a2L--tKo",
+    "AIzaSyBPjyMfHkS9OW3h7G0kmLSQkWQMfqfX5v0",
+    "AIzaSyA4HvCdIc4gGK4YCBlWS3vfXGjY3y9Zadg",
+    "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"
+  ]
+}

Config/Config.json ADDED Viewed

File without changes

Config/Configs.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import logging
+import os
+import faiss
+# Configure root logging when this module is imported: INFO level, "time - level - message" lines.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+# Process-wide environment switches, also set at import time.
+# NOTE(review): CUDA_LAUNCH_BLOCKING and TORCH_USE_CUDA_DSA are presumably debugging aids —
+# confirm they should stay enabled outside debugging sessions.
+os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+os.environ["TORCH_USE_CUDA_DSA"] = "1"
+def ConfigValues(service="Search", inputs="file.pdf"):
+    """Return the settings dictionary (file paths, payload keys, model names) for one service.
+
+    Args:
+        service: Service name; it names the document files and the database
+            sub-folder and file prefix.
+        inputs: File name resolved under ``./Private/Tests``.
+
+    Returns:
+        A dict of path strings, key names, model identifiers, the FAISS index
+        class and the word limit.
+    """
+    assets = "./Assets"
+    documents = f"./Documents/{service}"
+    # Every database artifact shares the prefix ./Database/<service>/<service>_<group>_...
+    db_prefix = f"./Database/{service}/{service}"
+    extract_prefix = f"{db_prefix}_Extract"
+    chunks_prefix = f"{db_prefix}_Chunks"
+    embedding_prefix = f"{db_prefix}_Embedding"
+    return {
+        # Inputs and documents
+        "inputPath": f"./Private/Tests/{inputs}",
+        "PdfPath": f"{documents}.pdf",
+        "DocPath": f"{documents}.docx",
+        # Assets
+        "exceptPath": f"{assets}/ex.exceptions.json",
+        "markerPath": f"{assets}/ex.markers.json",
+        "statusPath": f"{assets}/ex.status.json",
+        # Database artifacts
+        "RawDataPath": f"{extract_prefix}_Raw.json",
+        "RawLvlsPath": f"{extract_prefix}_Levels.json",
+        "StructsPath": f"{chunks_prefix}_Struct.json",
+        "SegmentPath": f"{chunks_prefix}_Segment.json",
+        "SchemaPath": f"{chunks_prefix}_Schema.json",
+        "FaissPath": f"{embedding_prefix}_Index.faiss",
+        "MappingPath": f"{embedding_prefix}_Mapping.json",
+        "MapDataPath": f"{embedding_prefix}_MapData.json",
+        "MapChunkPath": f"{embedding_prefix}_MapChunk.json",
+        "MetaPath": f"{embedding_prefix}_Meta.json",
+        # Keys
+        "DATA_KEY": "contents",
+        "EMBE_KEY": "embeddings",
+        # Models
+        "SEARCH_EGINE": faiss.IndexFlatIP,
+        "RERANK_MODEL": "BAAI/bge-reranker-base",
+        "RESPON_MODEL": "gpt-3.5-turbo",
+        "CHUNKS_MODEL": "paraphrase-multilingual-MiniLM-L12-v2",
+        "EMBEDD_MODEL": "VoVanPhuc/sup-SimCSE-VietNamese-phobert-base",
+        "SUMARY_MODEL": "vinai/bartpho-syllable",
+        "WORD_LIMIT": 1000
+    }

Config/ModelLoader.py ADDED Viewed

	@@ -0,0 +1,280 @@

+# ============================================================
+# Config/ModelLoader.py  —  Official, unified, complete
+# - Manage Encoder/Chunker (SentenceTransformer) and Summarizer (Seq2Seq)
+# - Auto-download to local cache when missing
+# - GPU/CPU selection with CUDA checks
+# - Consistent class-based API
+# ============================================================
+import os
+import torch
+from typing import List, Tuple, Optional, Dict, Any
+from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+class ModelLoader:
+    """
+    Unified model manager:
+      - Encoder (SentenceTransformer)
+      - Chunker (SentenceTransformer)
+      - Summarizer (Seq2Seq: T5/BART/vit5)
+    Provides:
+      - load_encoder(name, cache)
+      - load_chunker(name, cache)
+      - load_summarizer(name, cache)
+      - summarize(text, max_len, min_len)
+      - summarize_batch(texts, max_len, min_len)
+      - print_devices()
+    """
+    # -----------------------------
+    # Construction / State
+    # -----------------------------
+    def __init__(self, prefer_cuda: bool = True) -> None:
+        """Create an empty loader; ``prefer_cuda`` selects the GPU whenever one is available."""
+        self.prefer_cuda = prefer_cuda
+        # Registries filled by the load_* methods, keyed by role name.
+        self.devices: Dict[str, torch.device] = {}
+        self.tokenizers: Dict[str, Any] = {}
+        self.models: Dict[str, Any] = {}
+    # -----------------------------
+    # Device helpers
+    # -----------------------------
+    @staticmethod
+    def _cuda_check() -> None:
+        """Print a CUDA diagnostic report: availability, GPU count and, with CUDA, device details."""
+        has_cuda = torch.cuda.is_available()
+        print("CUDA supported:", has_cuda)
+        print("Number of GPUs:", torch.cuda.device_count())
+        if not has_cuda:
+            print("⚠️ CUDA not available, using CPU.")
+            return
+        print("Current GPU:", torch.cuda.get_device_name(0))
+        print("Capability:", torch.cuda.get_device_capability(0))
+        print("CUDA version (PyTorch):", torch.version.cuda)
+        print("cuDNN version:", torch.backends.cudnn.version())
+    def _get_device(self) -> torch.device:
+        """Return the CUDA device when it is both preferred and available, otherwise the CPU."""
+        use_gpu = self.prefer_cuda and torch.cuda.is_available()
+        return torch.device("cuda" if use_gpu else "cpu")
+    @staticmethod
+    def _ensure_dir(path: Optional[str]) -> None:
+        """Create ``path`` (including parents) when it is non-empty; do nothing otherwise."""
+        if not path:
+            return
+        os.makedirs(path, exist_ok=True)
+    # -----------------------------
+    # SentenceTransformer (Encoder/Chunker)
+    # -----------------------------
+    @staticmethod
+    def _ensure_cached_sentence_model(model_name: str, cache_path: str) -> str:
+        """
+        Make sure a saved SentenceTransformer exists at cache_path and return that path.
+        The model is fetched by name and saved when the path is missing, or when the
+        path exists but lacks config_sentence_transformers.json.
+        """
+        if os.path.exists(cache_path):
+            marker = os.path.join(cache_path, "config_sentence_transformers.json")
+            if not os.path.exists(marker):
+                print("⚙️ Rebuilding SentenceTransformer cache structure...")
+                SentenceTransformer(model_name).save(cache_path)
+            return cache_path
+        print(f"📥 Downloading SentenceTransformer to: {cache_path}")
+        SentenceTransformer(model_name).save(cache_path)
+        print("✅ Cached SentenceTransformer successfully.")
+        return cache_path
+    def _load_sentence_model(self, model_name: str, cache_path: Optional[str]) -> Tuple[SentenceTransformer, torch.device]:
+        """Load a SentenceTransformer on the selected device, via cache_path when one is given.
+
+        Returns the model together with the device it was loaded on.
+        """
+        device = self._get_device()
+        print(f"\n🔍 Loading SentenceTransformer ({model_name}) on {device} ...")
+        self._cuda_check()
+        if not cache_path:
+            # No cache directory: load straight from the model name.
+            model = SentenceTransformer(model_name, device=str(device))
+        else:
+            self._ensure_dir(cache_path)
+            self._ensure_cached_sentence_model(model_name, cache_path)
+            model = SentenceTransformer(cache_path, device=str(device))
+            print(f"📂 Loaded from cache: {cache_path}")
+        print("✅ SentenceTransformer ready.")
+        return model, device
+    # Public APIs for SentenceTransformer
+    def load_encoder(self, name: str, cache: Optional[str] = None) -> Tuple[SentenceTransformer, torch.device]:
+        """Load a SentenceTransformer, register it under "encoder" and return ``(model, device)``."""
+        encoder, placed_on = self._load_sentence_model(name, cache)
+        self.models["encoder"], self.devices["encoder"] = encoder, placed_on
+        return encoder, placed_on
+    def load_chunker(self, name: str, cache: Optional[str] = None) -> Tuple[SentenceTransformer, torch.device]:
+        """Load a SentenceTransformer, register it under "chunker" and return ``(model, device)``."""
+        chunker, placed_on = self._load_sentence_model(name, cache)
+        self.models["chunker"], self.devices["chunker"] = chunker, placed_on
+        return chunker, placed_on
+    # -----------------------------
+    # Summarizer (Seq2Seq: T5/BART/vit5)
+    # -----------------------------
+    @staticmethod
+    def _has_hf_config(cache_dir: str) -> bool:
+        """Tell whether ``cache_dir`` contains a ``config.json`` entry."""
+        config_file = os.path.join(cache_dir, "config.json")
+        return os.path.exists(config_file)
+    @staticmethod
+    def _download_and_cache_summarizer(model_name: str, cache_dir: str) -> None:
+        """
+        Fetch the tokenizer and the seq2seq model by name, then save both
+        into cache_dir with save_pretrained (creating the directory if needed).
+        """
+        print("⚙️ Cache missing — downloading model from Hugging Face...")
+        # Tokenizer first, then model — both are fetched before anything is written.
+        artifacts = (
+            AutoTokenizer.from_pretrained(model_name),
+            AutoModelForSeq2SeqLM.from_pretrained(model_name),
+        )
+        os.makedirs(cache_dir, exist_ok=True)
+        for artifact in artifacts:
+            artifact.save_pretrained(cache_dir)
+        print(f"✅ Summarizer cached at: {cache_dir}")
+    def _load_summarizer_core(self, model_or_dir: str, device: torch.device) -> Tuple[AutoTokenizer, AutoModelForSeq2SeqLM]:
+        """Load the tokenizer and the seq2seq model from a name or directory; the model is moved to ``device``."""
+        tok = AutoTokenizer.from_pretrained(model_or_dir)
+        seq2seq = AutoModelForSeq2SeqLM.from_pretrained(model_or_dir)
+        return tok, seq2seq.to(device)
+    def load_summarizer(self, name: str, cache: Optional[str] = None) -> Tuple[AutoTokenizer, AutoModelForSeq2SeqLM, torch.device]:
+        """
+        Load the seq2seq summarizer and register it under "summarizer".
+        With a cache directory, the model is downloaded into it first when the
+        directory has no config.json, then loaded from there; without one it is
+        loaded directly by name. Returns (tokenizer, model, device).
+        """
+        device = self._get_device()
+        print(f"\n🔍 Initializing summarizer ({name}) on {device} ...")
+        self._cuda_check()
+        if not cache:
+            print("🌐 Loading summarizer directly from Hugging Face (no cache dir provided)...")
+            source = name
+        else:
+            self._ensure_dir(cache)
+            if not self._has_hf_config(cache):
+                self._download_and_cache_summarizer(name, cache)
+            print("📂 Loading summarizer from local cache...")
+            source = cache
+        tok, mdl = self._load_summarizer_core(source, device)
+        self.tokenizers["summarizer"] = tok
+        self.models["summarizer"] = mdl
+        self.devices["summarizer"] = device
+        print(f"✅ Summarizer ready on {device}")
+        return tok, mdl, device
+    # -----------------------------
+    # Summarization helpers
+    # -----------------------------
+    @staticmethod
+    def _apply_vietnews_prefix(text: str, prefix: str, suffix: str) -> str:
+        """
+        For VietAI/vit5-vietnews: prefix 'vietnews: ' and suffix ' </s>'
+        Safe for general T5-family; harmless for BART-family.
+        """
+        t = (text or "").strip()
+        if not t:
+            return ""
+        return f"{prefix}{t}{suffix}"
+    def summarize(self,
+                  text: str,
+                  max_len: int = 256,
+                  min_len: int = 64,
+                  prefix: str = "vietnews: ",
+                  suffix: str = " </s>") -> str:
+        """
+        Summarize a single text with loaded summarizer.
+        Raises RuntimeError if summarizer not loaded.
+        """
+        if "summarizer" not in self.models or "summarizer" not in self.tokenizers:
+            raise RuntimeError("❌ Summarizer not loaded. Call load_summarizer() first.")
+        model: AutoModelForSeq2SeqLM = self.models["summarizer"]
+        tokenizer: AutoTokenizer = self.tokenizers["summarizer"]
+        device: torch.device = self.devices["summarizer"]
+        prepared = self._apply_vietnews_prefix(text, prefix, suffix)
+        if not prepared:
+            return ""
+        encoding = tokenizer(
+            prepared,
+            return_tensors="pt",
+            truncation=True,
+            max_length=1024
+        ).to(device)
+        with torch.no_grad():
+            outputs = model.generate(
+                **encoding,
+                max_length=max_len,
+                min_length=min_len,
+                num_beams=4,
+                no_repeat_ngram_size=3,
+                early_stopping=True
+            )
+        summary = tokenizer.decode(
+            outputs[0],
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=True
+        )
+        return summary
+    def summarize_batch(self,
+                        texts: List[str],
+                        max_len: int = 256,
+                        min_len: int = 64,
+                        prefix: str = "vietnews: ",
+                        suffix: str = " </s>") -> List[str]:
+        """
+        Batch summarization. Processes in a single forward pass when possible.
+        """
+        if "summarizer" not in self.models or "summarizer" not in self.tokenizers:
+            raise RuntimeError("❌ Summarizer not loaded. Call load_summarizer() first.")
+        model: AutoModelForSeq2SeqLM = self.models["summarizer"]
+        tokenizer: AutoTokenizer = self.tokenizers["summarizer"]
+        device: torch.device = self.devices["summarizer"]
+        batch = [self._apply_vietnews_prefix(t, prefix, suffix) for t in texts]
+        batch = [b for b in batch if b]  # drop empties
+        if not batch:
+            return []
+        encoding = tokenizer(
+            batch,
+            return_tensors="pt",
+            truncation=True,
+            max_length=1024,
+            padding=True
+        ).to(device)
+        summaries: List[str] = []
+        with torch.no_grad():
+            outputs = model.generate(
+                **encoding,
+                max_length=max_len,
+                min_length=min_len,
+                num_beams=4,
+                no_repeat_ngram_size=3,
+                early_stopping=True
+            )
+        for i in range(outputs.shape[0]):
+            dec = tokenizer.decode(
+                outputs[i],
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=True
+            )
+            summaries.append(dec)
+        return summaries
+    # -----------------------------
+    # Diagnostics
+    # -----------------------------
+    def print_devices(self) -> None:
+        print("\n📊 Device summary:")
+        for key, dev in self.devices.items():
+            print(f"  - {key}: {dev}")

Database/Categories/Categories_Chunks_Schema.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "Article": "string",
+  "Content": "object",
+  "Content.SubCategory": "object",
+  "Content.SubCategory.Core": "array",
+  "Content.SubCategory.Applied": "array",
+  "Content.SubCategory.Interdisciplinary": "array"
+}

Database/Categories/Categories_Chunks_Segment.json ADDED Viewed

	@@ -0,0 +1,313 @@

+[
+  {
+    "Article": "Toán Học",
+    "Content": {
+      "SubCategory": {
+        "Core": [
+          "Đại số tuyến tính",
+          "Giải tích vi phân - tích phân",
+          "Hình học phẳng và không gian",
+          "Tổ hợp và xác suất",
+          "Logic toán học"
+        ],
+        "Applied": [
+          "Toán rời rạc",
+          "Thống kê ứng dụng",
+          "Tối ưu hóa",
+          "Toán mô phỏng"
+        ],
+        "Interdisciplinary": [
+          "Toán học trong trí tuệ nhân tạo",
+          "Toán tài chính",
+          "Phân tích dữ liệu định lượng"
+        ]
+      }
+    }
+  },
+  {
+    "Article": "Tin Học",
+    "Content": {
+      "SubCategory": {
+        "Core": [
+          "Thuật toán và cấu trúc dữ liệu",
+          "Ngôn ngữ lập trình",
+          "Hệ điều hành",
+          "Cơ sở dữ liệu",
+          "Mạng máy tính"
+        ],
+        "Applied": [
+          "Phát triển phần mềm",
+          "Phân tích dữ liệu",
+          "Trí tuệ nhân tạo",
+          "Bảo mật thông tin"
+        ],
+        "Interdisciplinary": [
+          "Khoa học dữ liệu",
+          "Học máy và học sâu",
+          "Tin học y sinh",
+          "Thị giác máy tính"
+        ]
+      }
+    }
+  },
+  {
+    "Article": "Vật Lý",
+    "Content": {
+      "SubCategory": {
+        "Core": [
+          "Cơ học cổ điển",
+          "Điện học và từ học",
+          "Quang học",
+          "Nhiệt học",
+          "Dao động và sóng"
+        ],
+        "Applied": [
+          "Điện tử học cơ bản",
+          "Vật lý hạt nhân",
+          "Vật lý chất rắn",
+          "Vật lý thiên văn"
+        ],
+        "Interdisciplinary": [
+          "Vật lý lượng tử",
+          "Vật lý vật liệu",
+          "Khoa học năng lượng",
+          "Vật lý tính toán"
+        ]
+      }
+    }
+  },
+  {
+    "Article": "Hóa Học",
+    "Content": {
+      "SubCategory": {
+        "Core": [
+          "Hóa vô cơ",
+          "Hóa hữu cơ",
+          "Hóa lý",
+          "Hóa phân tích",
+          "Liên kết hóa học"
+        ],
+        "Applied": [
+          "Hóa học môi trường",
+          "Hóa học vật liệu",
+          "Hóa dược",
+          "Công nghệ hóa học"
+        ],
+        "Interdisciplinary": [
+          "Hóa sinh học",
+          "Hóa học tính toán",
+          "Hóa học năng lượng",
+          "Hóa học nano"
+        ]
+      }
+    }
+  },
+  {
+    "Article": "Công Nghệ",
+    "Content": {
+      "SubCategory": {
+        "Core": [
+          "Cơ khí chế tạo",
+          "Điện - Điện tử cơ bản",
+          "Tự động hóa",
+          "Kỹ thuật vật liệu",
+          "An toàn kỹ thuật"
+        ],
+        "Applied": [
+          "Robot và IoT",
+          "Công nghệ năng lượng tái tạo",
+          "Công nghệ môi trường",
+          "Công nghệ sinh học ứng dụng"
+        ],
+        "Interdisciplinary": [
+          "Công nghệ thực phẩm",
+          "Công nghệ nano",
+          "Kỹ thuật sản xuất thông minh",
+          "Công nghệ xanh và bền vững"
+        ]
+      }
+    }
+  },
+  {
+    "Article": "Sinh Học",
+    "Content": {
+      "SubCategory": {
+        "Core": [
+          "Tế bào học",
+          "Di truyền học",
+          "Sinh lý học",
+          "Sinh thái học",
+          "Tiến hóa học"
+        ],
+        "Applied": [
+          "Vi sinh vật học",
+          "Công nghệ sinh học",
+          "Sinh học phân tử",
+          "Sinh học phát triển"
+        ],
+        "Interdisciplinary": [
+          "Sinh học tính toán",
+          "Sinh học y học",
+          "Sinh học môi trường",
+          "Hệ gen học và tin sinh học"
+        ]
+      }
+    }
+  },
+  {
+    "Article": "Văn Học",
+    "Content": {
+      "SubCategory": {
+        "Core": [
+          "Lý luận văn học",
+          "Văn học Việt Nam",
+          "Văn học nước ngoài",
+          "Ngữ pháp và ngôn ngữ",
+          "Thể loại văn học"
+        ],
+        "Applied": [
+          "Phân tích và bình giảng tác phẩm",
+          "Làm văn - sáng tác",
+          "Phong cách học",
+          "Phê bình văn học"
+        ],
+        "Interdisciplinary": [
+          "Văn học so sánh",
+          "Văn hóa học",
+          "Ngôn ngữ học ứng dụng trong văn học",
+          "Tư duy phản biện văn học"
+        ]
+      }
+    }
+  },
+  {
+    "Article": "Địa Lý",
+    "Content": {
+      "SubCategory": {
+        "Core": [
+          "Địa lý tự nhiên",
+          "Địa lý kinh tế",
+          "Địa lý dân cư",
+          "Khí hậu học",
+          "Địa mạo học"
+        ],
+        "Applied": [
+          "Bản đồ học",
+          "Quản lý tài nguyên",
+          "Quy hoạch vùng và đô thị",
+          "Địa lý Việt Nam"
+        ],
+        "Interdisciplinary": [
+          "Địa lý môi trường",
+          "Địa lý chính trị",
+          "GIS và viễn thám",
+          "Địa lý phát triển bền vững"
+        ]
+      }
+    }
+  },
+  {
+    "Article": "Lịch Sử",
+    "Content": {
+      "SubCategory": {
+        "Core": [
+          "Lịch sử Việt Nam cổ đại",
+          "Lịch sử Việt Nam cận đại",
+          "Lịch sử thế giới cổ đại",
+          "Lịch sử thế giới hiện đại",
+          "Phương pháp sử học"
+        ],
+        "Applied": [
+          "Lịch sử chiến tranh và cách mạng",
+          "Văn hóa và xã hội qua các thời kỳ",
+          "Nhân vật lịch sử tiêu biểu",
+          "Nghiên cứu di sản"
+        ],
+        "Interdisciplinary": [
+          "Lịch sử tư tưởng - triết học",
+          "Lịch sử tôn giáo",
+          "Lịch sử khu vực (châu Á, Âu, Mỹ)",
+          "Lịch sử nghệ thuật"
+        ]
+      }
+    }
+  },
+  {
+    "Article": "Kinh Tế",
+    "Content": {
+      "SubCategory": {
+        "Core": [
+          "Kinh tế vi mô",
+          "Kinh tế vĩ mô",
+          "Thống kê kinh tế",
+          "Kinh tế lượng",
+          "Lý thuyết trò chơi"
+        ],
+        "Applied": [
+          "Tài chính - Ngân hàng",
+          "Kế toán - Kiểm toán",
+          "Marketing",
+          "Quản trị kinh doanh"
+        ],
+        "Interdisciplinary": [
+          "Kinh tế quốc tế",
+          "Kinh tế học hành vi",
+          "Kinh tế phát triển",
+          "Thương mại điện tử"
+        ]
+      }
+    }
+  },
+  {
+    "Article": "Chính Trị",
+    "Content": {
+      "SubCategory": {
+        "Core": [
+          "Chủ nghĩa Mác - Lênin",
+          "Tư tưởng Hồ Chí Minh",
+          "Hệ thống chính trị Việt Nam",
+          "Triết học chính trị",
+          "Lý luận nhà nước và pháp luật"
+        ],
+        "Applied": [
+          "Quan hệ quốc tế",
+          "Công tác Đảng và chính quyền",
+          "Chính sách công",
+          "Quản trị nhà nước"
+        ],
+        "Interdisciplinary": [
+          "Chính trị học so sánh",
+          "Kinh tế chính trị",
+          "Xã hội học chính trị",
+          "Truyền thông chính trị"
+        ]
+      }
+    }
+  },
+  {
+    "Article": "Ngoại Ngữ",
+    "Content": {
+      "SubCategory": {
+        "Core": [
+          "Ngữ pháp cơ bản",
+          "Từ vựng và ngữ nghĩa",
+          "Ngữ âm - Phát âm",
+          "Cấu trúc câu",
+          "Ngữ dụng học"
+        ],
+        "Applied": [
+          "Kỹ năng nghe",
+          "Kỹ năng nói",
+          "Kỹ năng đọc hiểu",
+          "Kỹ năng viết"
+        ],
+        "Interdisciplinary": [
+          "Biên - phiên dịch học",
+          "Ngôn ngữ học so sánh",
+          "Văn hóa và giao tiếp liên văn hóa",
+          "Ngôn ngữ học ứng dụng trong AI"
+        ]
+      }
+    }
+  }
+]

Database/Categories/Categories_Embedding_Index.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eccb9e6695079ec697e9570526d9a44a34efb376ecd159f66d4448d61f9b49e0
+size 513069

Database/Categories/Categories_Embedding_MapChunk.json ADDED Viewed

	@@ -0,0 +1,175 @@

+{
+  "index_to_chunk": {
+    "0": [1],
+    "1": [1],
+    "2": [1],
+    "3": [1],
+    "4": [1],
+    "5": [1],
+    "6": [1],
+    "7": [1],
+    "8": [1],
+    "9": [1],
+    "10": [1],
+    "11": [1],
+    "12": [1],
+    "13": [2],
+    "14": [2],
+    "15": [2],
+    "16": [2],
+    "17": [2],
+    "18": [2],
+    "19": [2],
+    "20": [2],
+    "21": [2],
+    "22": [2],
+    "23": [2],
+    "24": [2],
+    "25": [2],
+    "26": [2],
+    "27": [3],
+    "28": [3],
+    "29": [3],
+    "30": [3],
+    "31": [3],
+    "32": [3],
+    "33": [3],
+    "34": [3],
+    "35": [3],
+    "36": [3],
+    "37": [3],
+    "38": [3],
+    "39": [3],
+    "40": [3],
+    "41": [4],
+    "42": [4],
+    "43": [4],
+    "44": [4],
+    "45": [4],
+    "46": [4],
+    "47": [4],
+    "48": [4],
+    "49": [4],
+    "50": [4],
+    "51": [4],
+    "52": [4],
+    "53": [4],
+    "54": [4],
+    "55": [5],
+    "56": [5],
+    "57": [5],
+    "58": [5],
+    "59": [5],
+    "60": [5],
+    "61": [5],
+    "62": [5],
+    "63": [5],
+    "64": [5],
+    "65": [5],
+    "66": [5],
+    "67": [5],
+    "68": [5],
+    "69": [6],
+    "70": [6],
+    "71": [6],
+    "72": [6],
+    "73": [6],
+    "74": [6],
+    "75": [6],
+    "76": [6],
+    "77": [6],
+    "78": [6],
+    "79": [6],
+    "80": [6],
+    "81": [6],
+    "82": [6],
+    "83": [7],
+    "84": [7],
+    "85": [7],
+    "86": [7],
+    "87": [7],
+    "88": [7],
+    "89": [7],
+    "90": [7],
+    "91": [7],
+    "92": [7],
+    "93": [7],
+    "94": [7],
+    "95": [7],
+    "96": [7],
+    "97": [8],
+    "98": [8],
+    "99": [8],
+    "100": [8],
+    "101": [8],
+    "102": [8],
+    "103": [8],
+    "104": [8],
+    "105": [8],
+    "106": [8],
+    "107": [8],
+    "108": [8],
+    "109": [8],
+    "110": [8],
+    "111": [9],
+    "112": [9],
+    "113": [9],
+    "114": [9],
+    "115": [9],
+    "116": [9],
+    "117": [9],
+    "118": [9],
+    "119": [9],
+    "120": [9],
+    "121": [9],
+    "122": [9],
+    "123": [9],
+    "124": [9],
+    "125": [10],
+    "126": [10],
+    "127": [10],
+    "128": [10],
+    "129": [10],
+    "130": [10],
+    "131": [10],
+    "132": [10],
+    "133": [10],
+    "134": [10],
+    "135": [10],
+    "136": [10],
+    "137": [10],
+    "138": [10],
+    "139": [11],
+    "140": [11],
+    "141": [11],
+    "142": [11],
+    "143": [11],
+    "144": [11],
+    "145": [11],
+    "146": [11],
+    "147": [11],
+    "148": [11],
+    "149": [11],
+    "150": [11],
+    "151": [11],
+    "152": [11],
+    "153": [12],
+    "154": [12],
+    "155": [12],
+    "156": [12],
+    "157": [12],
+    "158": [12],
+    "159": [12],
+    "160": [12],
+    "161": [12],
+    "162": [12],
+    "163": [12],
+    "164": [12],
+    "165": [12],
+    "166": [12]
+  },
+  "meta": {
+    "count": 167,
+    "source": "Categories_Chunks_Segment.json"
+  }
+}

Database/Categories/Categories_Embedding_MapData.json ADDED Viewed

	@@ -0,0 +1,845 @@

+{
+  "items": [
+    {
+      "index": 0,
+      "key": "Article",
+      "text": "Toán Học"
+    },
+    {
+      "index": 1,
+      "key": "Content.SubCategory.Core[0]",
+      "text": "Đại số tuyến tính"
+    },
+    {
+      "index": 2,
+      "key": "Content.SubCategory.Core[1]",
+      "text": "Giải tích vi phân - tích phân"
+    },
+    {
+      "index": 3,
+      "key": "Content.SubCategory.Core[2]",
+      "text": "Hình học phẳng và không gian"
+    },
+    {
+      "index": 4,
+      "key": "Content.SubCategory.Core[3]",
+      "text": "Tổ hợp và xác suất"
+    },
+    {
+      "index": 5,
+      "key": "Content.SubCategory.Core[4]",
+      "text": "Logic toán học"
+    },
+    {
+      "index": 6,
+      "key": "Content.SubCategory.Applied[0]",
+      "text": "Toán rời rạc"
+    },
+    {
+      "index": 7,
+      "key": "Content.SubCategory.Applied[1]",
+      "text": "Thống kê ứng dụng"
+    },
+    {
+      "index": 8,
+      "key": "Content.SubCategory.Applied[2]",
+      "text": "Tối ưu hóa"
+    },
+    {
+      "index": 9,
+      "key": "Content.SubCategory.Applied[3]",
+      "text": "Toán mô phỏng"
+    },
+    {
+      "index": 10,
+      "key": "Content.SubCategory.Interdisciplinary[0]",
+      "text": "Toán học trong trí tuệ nhân tạo"
+    },
+    {
+      "index": 11,
+      "key": "Content.SubCategory.Interdisciplinary[1]",
+      "text": "Toán tài chính"
+    },
+    {
+      "index": 12,
+      "key": "Content.SubCategory.Interdisciplinary[2]",
+      "text": "Phân tích dữ liệu định lượng"
+    },
+    {
+      "index": 13,
+      "key": "Article",
+      "text": "Tin Học"
+    },
+    {
+      "index": 14,
+      "key": "Content.SubCategory.Core[0]",
+      "text": "Thuật toán và cấu trúc dữ liệu"
+    },
+    {
+      "index": 15,
+      "key": "Content.SubCategory.Core[1]",
+      "text": "Ngôn ngữ lập trình"
+    },
+    {
+      "index": 16,
+      "key": "Content.SubCategory.Core[2]",
+      "text": "Hệ điều hành"
+    },
+    {
+      "index": 17,
+      "key": "Content.SubCategory.Core[3]",
+      "text": "Cơ sở dữ liệu"
+    },
+    {
+      "index": 18,
+      "key": "Content.SubCategory.Core[4]",
+      "text": "Mạng máy tính"
+    },
+    {
+      "index": 19,
+      "key": "Content.SubCategory.Applied[0]",
+      "text": "Phát triển phần mềm"
+    },
+    {
+      "index": 20,
+      "key": "Content.SubCategory.Applied[1]",
+      "text": "Phân tích dữ liệu"
+    },
+    {
+      "index": 21,
+      "key": "Content.SubCategory.Applied[2]",
+      "text": "Trí tuệ nhân tạo"
+    },
+    {
+      "index": 22,
+      "key": "Content.SubCategory.Applied[3]",
+      "text": "Bảo mật thông tin"
+    },
+    {
+      "index": 23,
+      "key": "Content.SubCategory.Interdisciplinary[0]",
+      "text": "Khoa học dữ liệu"
+    },
+    {
+      "index": 24,
+      "key": "Content.SubCategory.Interdisciplinary[1]",
+      "text": "Học máy và học sâu"
+    },
+    {
+      "index": 25,
+      "key": "Content.SubCategory.Interdisciplinary[2]",
+      "text": "Tin học y sinh"
+    },
+    {
+      "index": 26,
+      "key": "Content.SubCategory.Interdisciplinary[3]",
+      "text": "Thị giác máy tính"
+    },
+    {
+      "index": 27,
+      "key": "Article",
+      "text": "Vật Lý"
+    },
+    {
+      "index": 28,
+      "key": "Content.SubCategory.Core[0]",
+      "text": "Cơ học cổ điển"
+    },
+    {
+      "index": 29,
+      "key": "Content.SubCategory.Core[1]",
+      "text": "Điện học và từ học"
+    },
+    {
+      "index": 30,
+      "key": "Content.SubCategory.Core[2]",
+      "text": "Quang học"
+    },
+    {
+      "index": 31,
+      "key": "Content.SubCategory.Core[3]",
+      "text": "Nhiệt học"
+    },
+    {
+      "index": 32,
+      "key": "Content.SubCategory.Core[4]",
+      "text": "Dao động và sóng"
+    },
+    {
+      "index": 33,
+      "key": "Content.SubCategory.Applied[0]",
+      "text": "Điện tử học cơ bản"
+    },
+    {
+      "index": 34,
+      "key": "Content.SubCategory.Applied[1]",
+      "text": "Vật lý hạt nhân"
+    },
+    {
+      "index": 35,
+      "key": "Content.SubCategory.Applied[2]",
+      "text": "Vật lý chất rắn"
+    },
+    {
+      "index": 36,
+      "key": "Content.SubCategory.Applied[3]",
+      "text": "Vật lý thiên văn"
+    },
+    {
+      "index": 37,
+      "key": "Content.SubCategory.Interdisciplinary[0]",
+      "text": "Vật lý lượng tử"
+    },
+    {
+      "index": 38,
+      "key": "Content.SubCategory.Interdisciplinary[1]",
+      "text": "Vật lý vật liệu"
+    },
+    {
+      "index": 39,
+      "key": "Content.SubCategory.Interdisciplinary[2]",
+      "text": "Khoa học năng lượng"
+    },
+    {
+      "index": 40,
+      "key": "Content.SubCategory.Interdisciplinary[3]",
+      "text": "Vật lý tính toán"
+    },
+    {
+      "index": 41,
+      "key": "Article",
+      "text": "Hóa Học"
+    },
+    {
+      "index": 42,
+      "key": "Content.SubCategory.Core[0]",
+      "text": "Hóa vô cơ"
+    },
+    {
+      "index": 43,
+      "key": "Content.SubCategory.Core[1]",
+      "text": "Hóa hữu cơ"
+    },
+    {
+      "index": 44,
+      "key": "Content.SubCategory.Core[2]",
+      "text": "Hóa lý"
+    },
+    {
+      "index": 45,
+      "key": "Content.SubCategory.Core[3]",
+      "text": "Hóa phân tích"
+    },
+    {
+      "index": 46,
+      "key": "Content.SubCategory.Core[4]",
+      "text": "Liên kết hóa học"
+    },
+    {
+      "index": 47,
+      "key": "Content.SubCategory.Applied[0]",
+      "text": "Hóa học môi trường"
+    },
+    {
+      "index": 48,
+      "key": "Content.SubCategory.Applied[1]",
+      "text": "Hóa học vật liệu"
+    },
+    {
+      "index": 49,
+      "key": "Content.SubCategory.Applied[2]",
+      "text": "Hóa dược"
+    },
+    {
+      "index": 50,
+      "key": "Content.SubCategory.Applied[3]",
+      "text": "Công nghệ hóa học"
+    },
+    {
+      "index": 51,
+      "key": "Content.SubCategory.Interdisciplinary[0]",
+      "text": "Hóa sinh học"
+    },
+    {
+      "index": 52,
+      "key": "Content.SubCategory.Interdisciplinary[1]",
+      "text": "Hóa học tính toán"
+    },
+    {
+      "index": 53,
+      "key": "Content.SubCategory.Interdisciplinary[2]",
+      "text": "Hóa học năng lượng"
+    },
+    {
+      "index": 54,
+      "key": "Content.SubCategory.Interdisciplinary[3]",
+      "text": "Hóa học nano"
+    },
+    {
+      "index": 55,
+      "key": "Article",
+      "text": "Công Nghệ"
+    },
+    {
+      "index": 56,
+      "key": "Content.SubCategory.Core[0]",
+      "text": "Cơ khí chế tạo"
+    },
+    {
+      "index": 57,
+      "key": "Content.SubCategory.Core[1]",
+      "text": "Điện - Điện tử cơ bản"
+    },
+    {
+      "index": 58,
+      "key": "Content.SubCategory.Core[2]",
+      "text": "Tự động hóa"
+    },
+    {
+      "index": 59,
+      "key": "Content.SubCategory.Core[3]",
+      "text": "Kỹ thuật vật liệu"
+    },
+    {
+      "index": 60,
+      "key": "Content.SubCategory.Core[4]",
+      "text": "An toàn kỹ thuật"
+    },
+    {
+      "index": 61,
+      "key": "Content.SubCategory.Applied[0]",
+      "text": "Robot và IoT"
+    },
+    {
+      "index": 62,
+      "key": "Content.SubCategory.Applied[1]",
+      "text": "Công nghệ năng lượng tái tạo"
+    },
+    {
+      "index": 63,
+      "key": "Content.SubCategory.Applied[2]",
+      "text": "Công nghệ môi trường"
+    },
+    {
+      "index": 64,
+      "key": "Content.SubCategory.Applied[3]",
+      "text": "Công nghệ sinh học ứng dụng"
+    },
+    {
+      "index": 65,
+      "key": "Content.SubCategory.Interdisciplinary[0]",
+      "text": "Công nghệ thực phẩm"
+    },
+    {
+      "index": 66,
+      "key": "Content.SubCategory.Interdisciplinary[1]",
+      "text": "Công nghệ nano"
+    },
+    {
+      "index": 67,
+      "key": "Content.SubCategory.Interdisciplinary[2]",
+      "text": "Kỹ thuật sản xuất thông minh"
+    },
+    {
+      "index": 68,
+      "key": "Content.SubCategory.Interdisciplinary[3]",
+      "text": "Công nghệ xanh và bền vững"
+    },
+    {
+      "index": 69,
+      "key": "Article",
+      "text": "Sinh Học"
+    },
+    {
+      "index": 70,
+      "key": "Content.SubCategory.Core[0]",
+      "text": "Tế bào học"
+    },
+    {
+      "index": 71,
+      "key": "Content.SubCategory.Core[1]",
+      "text": "Di truyền học"
+    },
+    {
+      "index": 72,
+      "key": "Content.SubCategory.Core[2]",
+      "text": "Sinh lý học"
+    },
+    {
+      "index": 73,
+      "key": "Content.SubCategory.Core[3]",
+      "text": "Sinh thái học"
+    },
+    {
+      "index": 74,
+      "key": "Content.SubCategory.Core[4]",
+      "text": "Tiến hóa học"
+    },
+    {
+      "index": 75,
+      "key": "Content.SubCategory.Applied[0]",
+      "text": "Vi sinh vật học"
+    },
+    {
+      "index": 76,
+      "key": "Content.SubCategory.Applied[1]",
+      "text": "Công nghệ sinh học"
+    },
+    {
+      "index": 77,
+      "key": "Content.SubCategory.Applied[2]",
+      "text": "Sinh học phân tử"
+    },
+    {
+      "index": 78,
+      "key": "Content.SubCategory.Applied[3]",
+      "text": "Sinh học phát triển"
+    },
+    {
+      "index": 79,
+      "key": "Content.SubCategory.Interdisciplinary[0]",
+      "text": "Sinh học tính toán"
+    },
+    {
+      "index": 80,
+      "key": "Content.SubCategory.Interdisciplinary[1]",
+      "text": "Sinh học y học"
+    },
+    {
+      "index": 81,
+      "key": "Content.SubCategory.Interdisciplinary[2]",
+      "text": "Sinh học môi trường"
+    },
+    {
+      "index": 82,
+      "key": "Content.SubCategory.Interdisciplinary[3]",
+      "text": "Hệ gen học và tin sinh học"
+    },
+    {
+      "index": 83,
+      "key": "Article",
+      "text": "Văn Học"
+    },
+    {
+      "index": 84,
+      "key": "Content.SubCategory.Core[0]",
+      "text": "Lý luận văn học"
+    },
+    {
+      "index": 85,
+      "key": "Content.SubCategory.Core[1]",
+      "text": "Văn học Việt Nam"
+    },
+    {
+      "index": 86,
+      "key": "Content.SubCategory.Core[2]",
+      "text": "Văn học nước ngoài"
+    },
+    {
+      "index": 87,
+      "key": "Content.SubCategory.Core[3]",
+      "text": "Ngữ pháp và ngôn ngữ"
+    },
+    {
+      "index": 88,
+      "key": "Content.SubCategory.Core[4]",
+      "text": "Thể loại văn học"
+    },
+    {
+      "index": 89,
+      "key": "Content.SubCategory.Applied[0]",
+      "text": "Phân tích và bình giảng tác phẩm"
+    },
+    {
+      "index": 90,
+      "key": "Content.SubCategory.Applied[1]",
+      "text": "Làm văn - sáng tác"
+    },
+    {
+      "index": 91,
+      "key": "Content.SubCategory.Applied[2]",
+      "text": "Phong cách học"
+    },
+    {
+      "index": 92,
+      "key": "Content.SubCategory.Applied[3]",
+      "text": "Phê bình văn học"
+    },
+    {
+      "index": 93,
+      "key": "Content.SubCategory.Interdisciplinary[0]",
+      "text": "Văn học so sánh"
+    },
+    {
+      "index": 94,
+      "key": "Content.SubCategory.Interdisciplinary[1]",
+      "text": "Văn hóa học"
+    },
+    {
+      "index": 95,
+      "key": "Content.SubCategory.Interdisciplinary[2]",
+      "text": "Ngôn ngữ học ứng dụng trong văn học"
+    },
+    {
+      "index": 96,
+      "key": "Content.SubCategory.Interdisciplinary[3]",
+      "text": "Tư duy phản biện văn học"
+    },
+    {
+      "index": 97,
+      "key": "Article",
+      "text": "Địa Lý"
+    },
+    {
+      "index": 98,
+      "key": "Content.SubCategory.Core[0]",
+      "text": "Địa lý tự nhiên"
+    },
+    {
+      "index": 99,
+      "key": "Content.SubCategory.Core[1]",
+      "text": "Địa lý kinh tế"
+    },
+    {
+      "index": 100,
+      "key": "Content.SubCategory.Core[2]",
+      "text": "Địa lý dân cư"
+    },
+    {
+      "index": 101,
+      "key": "Content.SubCategory.Core[3]",
+      "text": "Khí hậu học"
+    },
+    {
+      "index": 102,
+      "key": "Content.SubCategory.Core[4]",
+      "text": "Địa mạo học"
+    },
+    {
+      "index": 103,
+      "key": "Content.SubCategory.Applied[0]",
+      "text": "Bản đồ học"
+    },
+    {
+      "index": 104,
+      "key": "Content.SubCategory.Applied[1]",
+      "text": "Quản lý tài nguyên"
+    },
+    {
+      "index": 105,
+      "key": "Content.SubCategory.Applied[2]",
+      "text": "Quy hoạch vùng và đô thị"
+    },
+    {
+      "index": 106,
+      "key": "Content.SubCategory.Applied[3]",
+      "text": "Địa lý Việt Nam"
+    },
+    {
+      "index": 107,
+      "key": "Content.SubCategory.Interdisciplinary[0]",
+      "text": "Địa lý môi trường"
+    },
+    {
+      "index": 108,
+      "key": "Content.SubCategory.Interdisciplinary[1]",
+      "text": "Địa lý chính trị"
+    },
+    {
+      "index": 109,
+      "key": "Content.SubCategory.Interdisciplinary[2]",
+      "text": "GIS và viễn thám"
+    },
+    {
+      "index": 110,
+      "key": "Content.SubCategory.Interdisciplinary[3]",
+      "text": "Địa lý phát triển bền vững"
+    },
+    {
+      "index": 111,
+      "key": "Article",
+      "text": "Lịch Sử"
+    },
+    {
+      "index": 112,
+      "key": "Content.SubCategory.Core[0]",
+      "text": "Lịch sử Việt Nam cổ đại"
+    },
+    {
+      "index": 113,
+      "key": "Content.SubCategory.Core[1]",
+      "text": "Lịch sử Việt Nam cận đại"
+    },
+    {
+      "index": 114,
+      "key": "Content.SubCategory.Core[2]",
+      "text": "Lịch sử thế giới cổ đại"
+    },
+    {
+      "index": 115,
+      "key": "Content.SubCategory.Core[3]",
+      "text": "Lịch sử thế giới hiện đại"
+    },
+    {
+      "index": 116,
+      "key": "Content.SubCategory.Core[4]",
+      "text": "Phương pháp sử học"
+    },
+    {
+      "index": 117,
+      "key": "Content.SubCategory.Applied[0]",
+      "text": "Lịch sử chiến tranh và cách mạng"
+    },
+    {
+      "index": 118,
+      "key": "Content.SubCategory.Applied[1]",
+      "text": "Văn hóa và xã hội qua các thời kỳ"
+    },
+    {
+      "index": 119,
+      "key": "Content.SubCategory.Applied[2]",
+      "text": "Nhân vật lịch sử tiêu biểu"
+    },
+    {
+      "index": 120,
+      "key": "Content.SubCategory.Applied[3]",
+      "text": "Nghiên cứu di sản"
+    },
+    {
+      "index": 121,
+      "key": "Content.SubCategory.Interdisciplinary[0]",
+      "text": "Lịch sử tư tưởng - triết học"
+    },
+    {
+      "index": 122,
+      "key": "Content.SubCategory.Interdisciplinary[1]",
+      "text": "Lịch sử tôn giáo"
+    },
+    {
+      "index": 123,
+      "key": "Content.SubCategory.Interdisciplinary[2]",
+      "text": "Lịch sử khu vực (châu Á, Âu, Mỹ)"
+    },
+    {
+      "index": 124,
+      "key": "Content.SubCategory.Interdisciplinary[3]",
+      "text": "Lịch sử nghệ thuật"
+    },
+    {
+      "index": 125,
+      "key": "Article",
+      "text": "Kinh Tế"
+    },
+    {
+      "index": 126,
+      "key": "Content.SubCategory.Core[0]",
+      "text": "Kinh tế vi mô"
+    },
+    {
+      "index": 127,
+      "key": "Content.SubCategory.Core[1]",
+      "text": "Kinh tế vĩ mô"
+    },
+    {
+      "index": 128,
+      "key": "Content.SubCategory.Core[2]",
+      "text": "Thống kê kinh tế"
+    },
+    {
+      "index": 129,
+      "key": "Content.SubCategory.Core[3]",
+      "text": "Kinh tế lượng"
+    },
+    {
+      "index": 130,
+      "key": "Content.SubCategory.Core[4]",
+      "text": "Lý thuyết trò chơi"
+    },
+    {
+      "index": 131,
+      "key": "Content.SubCategory.Applied[0]",
+      "text": "Tài chính - Ngân hàng"
+    },
+    {
+      "index": 132,
+      "key": "Content.SubCategory.Applied[1]",
+      "text": "Kế toán - Kiểm toán"
+    },
+    {
+      "index": 133,
+      "key": "Content.SubCategory.Applied[2]",
+      "text": "Marketing"
+    },
+    {
+      "index": 134,
+      "key": "Content.SubCategory.Applied[3]",
+      "text": "Quản trị kinh doanh"
+    },
+    {
+      "index": 135,
+      "key": "Content.SubCategory.Interdisciplinary[0]",
+      "text": "Kinh tế quốc tế"
+    },
+    {
+      "index": 136,
+      "key": "Content.SubCategory.Interdisciplinary[1]",
+      "text": "Kinh tế học hành vi"
+    },
+    {
+      "index": 137,
+      "key": "Content.SubCategory.Interdisciplinary[2]",
+      "text": "Kinh tế phát triển"
+    },
+    {
+      "index": 138,
+      "key": "Content.SubCategory.Interdisciplinary[3]",
+      "text": "Thương mại điện tử"
+    },
+    {
+      "index": 139,
+      "key": "Article",
+      "text": "Chính Trị"
+    },
+    {
+      "index": 140,
+      "key": "Content.SubCategory.Core[0]",
+      "text": "Chủ nghĩa Mác - Lênin"
+    },
+    {
+      "index": 141,
+      "key": "Content.SubCategory.Core[1]",
+      "text": "Tư tưởng Hồ Chí Minh"
+    },
+    {
+      "index": 142,
+      "key": "Content.SubCategory.Core[2]",
+      "text": "Hệ thống chính trị Việt Nam"
+    },
+    {
+      "index": 143,
+      "key": "Content.SubCategory.Core[3]",
+      "text": "Triết học chính trị"
+    },
+    {
+      "index": 144,
+      "key": "Content.SubCategory.Core[4]",
+      "text": "Lý luận nhà nước và pháp luật"
+    },
+    {
+      "index": 145,
+      "key": "Content.SubCategory.Applied[0]",
+      "text": "Quan hệ quốc tế"
+    },
+    {
+      "index": 146,
+      "key": "Content.SubCategory.Applied[1]",
+      "text": "Công tác Đảng và chính quyền"
+    },
+    {
+      "index": 147,
+      "key": "Content.SubCategory.Applied[2]",
+      "text": "Chính sách công"
+    },
+    {
+      "index": 148,
+      "key": "Content.SubCategory.Applied[3]",
+      "text": "Quản trị nhà nước"
+    },
+    {
+      "index": 149,
+      "key": "Content.SubCategory.Interdisciplinary[0]",
+      "text": "Chính trị học so sánh"
+    },
+    {
+      "index": 150,
+      "key": "Content.SubCategory.Interdisciplinary[1]",
+      "text": "Kinh tế chính trị"
+    },
+    {
+      "index": 151,
+      "key": "Content.SubCategory.Interdisciplinary[2]",
+      "text": "Xã hội học chính trị"
+    },
+    {
+      "index": 152,
+      "key": "Content.SubCategory.Interdisciplinary[3]",
+      "text": "Truyền thông chính trị"
+    },
+    {
+      "index": 153,
+      "key": "Article",
+      "text": "Ngoại Ngữ"
+    },
+    {
+      "index": 154,
+      "key": "Content.SubCategory.Core[0]",
+      "text": "Ngữ pháp cơ bản"
+    },
+    {
+      "index": 155,
+      "key": "Content.SubCategory.Core[1]",
+      "text": "Từ vựng và ngữ nghĩa"
+    },
+    {
+      "index": 156,
+      "key": "Content.SubCategory.Core[2]",
+      "text": "Ngữ âm - Phát âm"
+    },
+    {
+      "index": 157,
+      "key": "Content.SubCategory.Core[3]",
+      "text": "Cấu trúc câu"
+    },
+    {
+      "index": 158,
+      "key": "Content.SubCategory.Core[4]",
+      "text": "Ngữ dụng học"
+    },
+    {
+      "index": 159,
+      "key": "Content.SubCategory.Applied[0]",
+      "text": "Kỹ năng nghe"
+    },
+    {
+      "index": 160,
+      "key": "Content.SubCategory.Applied[1]",
+      "text": "Kỹ năng nói"
+    },
+    {
+      "index": 161,
+      "key": "Content.SubCategory.Applied[2]",
+      "text": "Kỹ năng đọc hiểu"
+    },
+    {
+      "index": 162,
+      "key": "Content.SubCategory.Applied[3]",
+      "text": "Kỹ năng viết"
+    },
+    {
+      "index": 163,
+      "key": "Content.SubCategory.Interdisciplinary[0]",
+      "text": "Biên - phiên dịch học"
+    },
+    {
+      "index": 164,
+      "key": "Content.SubCategory.Interdisciplinary[1]",
+      "text": "Ngôn ngữ học so sánh"
+    },
+    {
+      "index": 165,
+      "key": "Content.SubCategory.Interdisciplinary[2]",
+      "text": "Văn hóa và giao tiếp liên văn hóa"
+    },
+    {
+      "index": 166,
+      "key": "Content.SubCategory.Interdisciplinary[3]",
+      "text": "Ngôn ngữ học ứng dụng trong AI"
+    }
+  ],
+  "meta": {
+    "count": 167,
+    "flatten_mode": "split",
+    "schema_used": true,
+    "list_policy": "split"
+  }
+}

Database/Categories/Categories_Embedding_Mapping.json ADDED Viewed

	@@ -0,0 +1,177 @@

+{
+  "meta": {
+    "count": 167,
+    "dim": 768,
+    "metric": "ip",
+    "normalized": true
+  },
+  "index_to_key": {
+    "0": "Article",
+    "1": "Content.SubCategory.Core[0]",
+    "2": "Content.SubCategory.Core[1]",
+    "3": "Content.SubCategory.Core[2]",
+    "4": "Content.SubCategory.Core[3]",
+    "5": "Content.SubCategory.Core[4]",
+    "6": "Content.SubCategory.Applied[0]",
+    "7": "Content.SubCategory.Applied[1]",
+    "8": "Content.SubCategory.Applied[2]",
+    "9": "Content.SubCategory.Applied[3]",
+    "10": "Content.SubCategory.Interdisciplinary[0]",
+    "11": "Content.SubCategory.Interdisciplinary[1]",
+    "12": "Content.SubCategory.Interdisciplinary[2]",
+    "13": "Article",
+    "14": "Content.SubCategory.Core[0]",
+    "15": "Content.SubCategory.Core[1]",
+    "16": "Content.SubCategory.Core[2]",
+    "17": "Content.SubCategory.Core[3]",
+    "18": "Content.SubCategory.Core[4]",
+    "19": "Content.SubCategory.Applied[0]",
+    "20": "Content.SubCategory.Applied[1]",
+    "21": "Content.SubCategory.Applied[2]",
+    "22": "Content.SubCategory.Applied[3]",
+    "23": "Content.SubCategory.Interdisciplinary[0]",
+    "24": "Content.SubCategory.Interdisciplinary[1]",
+    "25": "Content.SubCategory.Interdisciplinary[2]",
+    "26": "Content.SubCategory.Interdisciplinary[3]",
+    "27": "Article",
+    "28": "Content.SubCategory.Core[0]",
+    "29": "Content.SubCategory.Core[1]",
+    "30": "Content.SubCategory.Core[2]",
+    "31": "Content.SubCategory.Core[3]",
+    "32": "Content.SubCategory.Core[4]",
+    "33": "Content.SubCategory.Applied[0]",
+    "34": "Content.SubCategory.Applied[1]",
+    "35": "Content.SubCategory.Applied[2]",
+    "36": "Content.SubCategory.Applied[3]",
+    "37": "Content.SubCategory.Interdisciplinary[0]",
+    "38": "Content.SubCategory.Interdisciplinary[1]",
+    "39": "Content.SubCategory.Interdisciplinary[2]",
+    "40": "Content.SubCategory.Interdisciplinary[3]",
+    "41": "Article",
+    "42": "Content.SubCategory.Core[0]",
+    "43": "Content.SubCategory.Core[1]",
+    "44": "Content.SubCategory.Core[2]",
+    "45": "Content.SubCategory.Core[3]",
+    "46": "Content.SubCategory.Core[4]",
+    "47": "Content.SubCategory.Applied[0]",
+    "48": "Content.SubCategory.Applied[1]",
+    "49": "Content.SubCategory.Applied[2]",
+    "50": "Content.SubCategory.Applied[3]",
+    "51": "Content.SubCategory.Interdisciplinary[0]",
+    "52": "Content.SubCategory.Interdisciplinary[1]",
+    "53": "Content.SubCategory.Interdisciplinary[2]",
+    "54": "Content.SubCategory.Interdisciplinary[3]",
+    "55": "Article",
+    "56": "Content.SubCategory.Core[0]",
+    "57": "Content.SubCategory.Core[1]",
+    "58": "Content.SubCategory.Core[2]",
+    "59": "Content.SubCategory.Core[3]",
+    "60": "Content.SubCategory.Core[4]",
+    "61": "Content.SubCategory.Applied[0]",
+    "62": "Content.SubCategory.Applied[1]",
+    "63": "Content.SubCategory.Applied[2]",
+    "64": "Content.SubCategory.Applied[3]",
+    "65": "Content.SubCategory.Interdisciplinary[0]",
+    "66": "Content.SubCategory.Interdisciplinary[1]",
+    "67": "Content.SubCategory.Interdisciplinary[2]",
+    "68": "Content.SubCategory.Interdisciplinary[3]",
+    "69": "Article",
+    "70": "Content.SubCategory.Core[0]",
+    "71": "Content.SubCategory.Core[1]",
+    "72": "Content.SubCategory.Core[2]",
+    "73": "Content.SubCategory.Core[3]",
+    "74": "Content.SubCategory.Core[4]",
+    "75": "Content.SubCategory.Applied[0]",
+    "76": "Content.SubCategory.Applied[1]",
+    "77": "Content.SubCategory.Applied[2]",
+    "78": "Content.SubCategory.Applied[3]",
+    "79": "Content.SubCategory.Interdisciplinary[0]",
+    "80": "Content.SubCategory.Interdisciplinary[1]",
+    "81": "Content.SubCategory.Interdisciplinary[2]",
+    "82": "Content.SubCategory.Interdisciplinary[3]",
+    "83": "Article",
+    "84": "Content.SubCategory.Core[0]",
+    "85": "Content.SubCategory.Core[1]",
+    "86": "Content.SubCategory.Core[2]",
+    "87": "Content.SubCategory.Core[3]",
+    "88": "Content.SubCategory.Core[4]",
+    "89": "Content.SubCategory.Applied[0]",
+    "90": "Content.SubCategory.Applied[1]",
+    "91": "Content.SubCategory.Applied[2]",
+    "92": "Content.SubCategory.Applied[3]",
+    "93": "Content.SubCategory.Interdisciplinary[0]",
+    "94": "Content.SubCategory.Interdisciplinary[1]",
+    "95": "Content.SubCategory.Interdisciplinary[2]",
+    "96": "Content.SubCategory.Interdisciplinary[3]",
+    "97": "Article",
+    "98": "Content.SubCategory.Core[0]",
+    "99": "Content.SubCategory.Core[1]",
+    "100": "Content.SubCategory.Core[2]",
+    "101": "Content.SubCategory.Core[3]",
+    "102": "Content.SubCategory.Core[4]",
+    "103": "Content.SubCategory.Applied[0]",
+    "104": "Content.SubCategory.Applied[1]",
+    "105": "Content.SubCategory.Applied[2]",
+    "106": "Content.SubCategory.Applied[3]",
+    "107": "Content.SubCategory.Interdisciplinary[0]",
+    "108": "Content.SubCategory.Interdisciplinary[1]",
+    "109": "Content.SubCategory.Interdisciplinary[2]",
+    "110": "Content.SubCategory.Interdisciplinary[3]",
+    "111": "Article",
+    "112": "Content.SubCategory.Core[0]",
+    "113": "Content.SubCategory.Core[1]",
+    "114": "Content.SubCategory.Core[2]",
+    "115": "Content.SubCategory.Core[3]",
+    "116": "Content.SubCategory.Core[4]",
+    "117": "Content.SubCategory.Applied[0]",
+    "118": "Content.SubCategory.Applied[1]",
+    "119": "Content.SubCategory.Applied[2]",
+    "120": "Content.SubCategory.Applied[3]",
+    "121": "Content.SubCategory.Interdisciplinary[0]",
+    "122": "Content.SubCategory.Interdisciplinary[1]",
+    "123": "Content.SubCategory.Interdisciplinary[2]",
+    "124": "Content.SubCategory.Interdisciplinary[3]",
+    "125": "Article",
+    "126": "Content.SubCategory.Core[0]",
+    "127": "Content.SubCategory.Core[1]",
+    "128": "Content.SubCategory.Core[2]",
+    "129": "Content.SubCategory.Core[3]",
+    "130": "Content.SubCategory.Core[4]",
+    "131": "Content.SubCategory.Applied[0]",
+    "132": "Content.SubCategory.Applied[1]",
+    "133": "Content.SubCategory.Applied[2]",
+    "134": "Content.SubCategory.Applied[3]",
+    "135": "Content.SubCategory.Interdisciplinary[0]",
+    "136": "Content.SubCategory.Interdisciplinary[1]",
+    "137": "Content.SubCategory.Interdisciplinary[2]",
+    "138": "Content.SubCategory.Interdisciplinary[3]",
+    "139": "Article",
+    "140": "Content.SubCategory.Core[0]",
+    "141": "Content.SubCategory.Core[1]",
+    "142": "Content.SubCategory.Core[2]",
+    "143": "Content.SubCategory.Core[3]",
+    "144": "Content.SubCategory.Core[4]",
+    "145": "Content.SubCategory.Applied[0]",
+    "146": "Content.SubCategory.Applied[1]",
+    "147": "Content.SubCategory.Applied[2]",
+    "148": "Content.SubCategory.Applied[3]",
+    "149": "Content.SubCategory.Interdisciplinary[0]",
+    "150": "Content.SubCategory.Interdisciplinary[1]",
+    "151": "Content.SubCategory.Interdisciplinary[2]",
+    "152": "Content.SubCategory.Interdisciplinary[3]",
+    "153": "Article",
+    "154": "Content.SubCategory.Core[0]",
+    "155": "Content.SubCategory.Core[1]",
+    "156": "Content.SubCategory.Core[2]",
+    "157": "Content.SubCategory.Core[3]",
+    "158": "Content.SubCategory.Core[4]",
+    "159": "Content.SubCategory.Applied[0]",
+    "160": "Content.SubCategory.Applied[1]",
+    "161": "Content.SubCategory.Applied[2]",
+    "162": "Content.SubCategory.Applied[3]",
+    "163": "Content.SubCategory.Interdisciplinary[0]",
+    "164": "Content.SubCategory.Interdisciplinary[1]",
+    "165": "Content.SubCategory.Interdisciplinary[2]",
+    "166": "Content.SubCategory.Interdisciplinary[3]"
+  }
+}

Demo/Assets/script.js ADDED Viewed

	@@ -0,0 +1,167 @@

+document.addEventListener("DOMContentLoaded", () => {
+    const chatBody = document.getElementById("chatBody");
+    const chatForm = document.getElementById("chatForm");
+    const fileInput = document.getElementById("fileUpload");
+    const fileCard = document.getElementById("fileCard");
+    const fileNameSpan = document.getElementById("fileName");
+    const removeFileBtn = document.getElementById("removeFile");
+    /**
+     * Gõ từng ký tự của một chuỗi văn bản vào một phần tử.
+     * @param {HTMLElement} element - Phần tử để gõ chữ vào.
+     * @param {string} text - Nội dung văn bản.
+     * @param {number} delay - Thời gian trễ giữa các ký tự (ms).
+     */
+    function typeWriter(element, text, delay = 10) {
+        let i = 0;
+        element.innerHTML = ""; // Xóa nội dung cũ
+        function typing() {
+            if (i < text.length) {
+                // Giữ lại các thẻ HTML như **, \n
+                if (text.substring(i, i + 2) === "**") {
+                    let boldEnd = text.indexOf("**", i + 2);
+                    if (boldEnd !== -1) {
+                        element.innerHTML += `<strong>${text.substring(i + 2, boldEnd)}</strong>`;
+                        i = boldEnd + 2;
+                    }
+                } else if (text.substring(i, i + 1) === "\n") {
+                     element.innerHTML += "<br>";
+                     i++;
+                }
+                else {
+                    element.innerHTML += text.charAt(i);
+                    i++;
+                }
+                chatBody.scrollTop = chatBody.scrollHeight; // Cuộn xuống khi gõ
+                setTimeout(typing, delay);
+            }
+        }
+        typing();
+    }
+    /**
+     * Tạo và nối một hàng tin nhắn mới vào thân chat.
+     * @param {string} sender - "user" hoặc "bot".
+     * @param {string} text - Nội dung văn bản của tin nhắn.
+     * @param {boolean} useTypewriter - Kích hoạt hiệu ứng gõ chữ cho bot.
+     */
+    function appendMessage(sender, text, useTypewriter = false) {
+        const messageRow = document.createElement("div");
+        const avatar = document.createElement("div");
+        const messageBubble = document.createElement("div")
+        messageRow.classList.add("message-row", `${sender}-row`);
+        avatar.classList.add("avatar");
+        avatar.textContent = "";
+        if (sender === 'bot'){
+            messageBubble.classList.add("bot-msg");
+        } else {
+            messageBubble.classList.add("user-msg");
+        }
+        if (sender === 'bot' && useTypewriter) {
+            typeWriter(messageBubble, text, 5);
+        } else {
+            // Thay thế markdown đơn giản cho hiển thị
+            const formattedText = text.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>').replace(/\n/g, '<br>');
+            messageBubble.innerHTML = formattedText;
+        }
+        messageRow.appendChild(avatar);
+        messageRow.appendChild(messageBubble);
+        chatBody.appendChild(messageRow);
+        chatBody.scrollTop = chatBody.scrollHeight;
+    }
+    /**
+     * Hiển thị chỉ báo đang gõ của bot.
+     */
+    function appendTyping() {
+        const typing = document.createElement("div");
+        typing.classList.add("typing");
+        typing.textContent = "AI đang xử lý...";
+        chatBody.appendChild(typing);
+        chatBody.scrollTop = chatBody.scrollHeight;
+        return typing;
+    }
+    // Initial greeting, shown shortly after the page loads
+    setTimeout(() => {
+        appendMessage("bot", "Xin chào! Hãy tải lên file PDF để tôi tóm tắt và phân loại cho bạn.", true);
+    }, 500);
+    // When a file is chosen, show the card with its name; hide the card if the selection is empty
+    fileInput.addEventListener("change", () => {
+        const file = fileInput.files[0];
+        if (file) {
+            fileCard.style.display = "flex";
+            fileNameSpan.textContent = file.name;
+        } else {
+            fileCard.style.display = "none";
+        }
+    });
+    // Remove-file button: clear the selection and hide the card
+    removeFileBtn.addEventListener("click", () => {
+        fileInput.value = "";
+        fileCard.style.display = "none";
+    });
+    /**
+     * Gửi file đến backend API để xử lý.
+     * @param {File} file - File PDF cần gửi.
+     */
+    async function sendFile(file) {
+        appendMessage("user", `Attached: ${file.name}`);
+        const typing = appendTyping();
+        const formData = new FormData();
+        formData.append("file", file);
+        try {
+            const response = await fetch("http://127.0.0.1:8000/process_pdf", {
+                method: "POST",
+                body: formData
+            });
+            const data = await response.json();
+            typing.remove();
+            if (response.ok && data.status == "success") {
+                if (data.checkstatus == 1) {
+                    appendMessage("bot", `✨**Chatbot**\n Đây là một văn bản về chủ đề **${data.category}** với nội dung được tóm tắt như sau: \n${data.summary}`, true);
+                } else {
+                    appendMessage(
+                        "bot",
+                        `✨**Chatbot**\n Văn bản không được chấp nhận:\n${data.summary}\n` +
+                        `Checkstatus: ${data.checkstatus}\n` +
+                        `Metrics:\n${JSON.stringify(data.metrics, null, 2)}`,
+                        true
+                    );
+                }
+            } else {
+                appendMessage("bot", `❌ **Lỗi:** ${data.message || "Không rõ"}`, true);
+            }
+        } catch (err) {
+            typing.remove();
+            appendMessage("bot", "⚠️ **Lỗi kết nối:** Không thể kết nối tới API!", true);
+        }
+    }
+    // Handle form submission: upload the selected file, then reset the file picker
+    chatForm.addEventListener("submit", async (e) => {
+        e.preventDefault();
+        const file = fileInput.files[0];
+        if (!file) {
+            appendMessage("bot", "⚠️ Vui lòng chọn một file PDF để xử lý.", true);
+            return;
+        }
+        await sendFile(file);
+        fileInput.value = "";
+        fileCard.style.display = "none";
+    });
+});

Demo/Assets/style.css ADDED Viewed

	@@ -0,0 +1,279 @@

+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap');
+:root {
+    --bg-primary: #0d1117;
+    --bg-secondary: #161b22;
+    --bg-tertiary: #21262d;
+    --border-color: #30363d;
+    --text-primary: #c9d1d9;
+    --text-secondary: #8b949e;
+    --accent-color: #58a6ff;
+    --user-msg-bg: #21262d;
+    --bot-msg-bg: #161b22;
+    --danger-color: #f85149;
+}
+* {
+    margin: 0;
+    padding: 0;
+    box-sizing: border-box;
+}
+body {
+    font-family: 'Inter', sans-serif;
+    background-color: var(--bg-primary);
+    color: var(--text-primary);
+    overflow: hidden;
+}
+.app-container {
+    display: grid;
+    grid-template-columns: 240px 1fr; /* A 5-column layout would be overly complex; 2 columns (one for the menu, one for the chat) work better */
+    height: 100vh;
+}
+/* --- Sidebar (Menu) --- */
+.sidebar {
+    background-color: var(--bg-secondary);
+    border-right: 1px solid var(--border-color);
+    padding: 20px;
+}
+.sidebar-header h3 {
+    font-size: 1.2rem;
+    margin-bottom: 24px;
+}
+.menu ul {
+    list-style-type: none;
+}
+.menu li a {
+    display: block;
+    padding: 10px 15px;
+    text-decoration: none;
+    color: var(--text-secondary);
+    border-radius: 6px;
+    transition: background-color 0.2s, color 0.2s;
+}
+.menu li a:hover {
+    background-color: var(--bg-tertiary);
+    color: var(--text-primary);
+}
+.menu li.active a {
+    background-color: rgba(88, 166, 255, 0.1);
+    color: var(--accent-color);
+    font-weight: 500;
+}
+/* --- Chat Container --- */
+.chat-container {
+    display: flex;
+    flex-direction: column;
+    background-color: var(--bg-primary);
+}
+.chat-header {
+    padding: 20px 30px;
+    border-bottom: 1px solid var(--border-color);
+    background-color: var(--bg-secondary);
+}
+.chat-header h2 {
+    font-size: 1.25rem;
+}
+.chat-header p {
+    color: var(--text-secondary);
+    font-size: 0.9rem;
+}
+.chat-body {
+    flex-grow: 1;
+    overflow-y: auto;
+    padding: 20px 30px;
+}
+/* --- Messages --- */
+.message-row {
+    display: flex;
+    align-items: flex-start;
+    gap: 15px;
+    margin-bottom: 20px;
+    max-width: 80%;
+}
+.bot-row {
+    justify-content: flex-start;
+}
+.user-row {
+    justify-content: flex-end;
+    margin-left: auto; /* Push user messages to the right */
+}
+.avatar {
+    width: 36px;
+    height: 36px;
+    border-radius: 50%;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    font-size: 1.2rem;
+    background-color: var(--bg-tertiary);
+}
+.bot-msg, .user-msg {
+    padding: 12px 16px;
+    border-radius: 18px;
+    line-height: 1.6;
+    border: 1px solid var(--border-color);
+}
+.bot-msg {
+    background-color: var(--bot-msg-bg);
+    border-top-left-radius: 4px;
+}
+.user-msg {
+    background-color: var(--user-msg-bg);
+    border-top-right-radius: 4px;
+}
+/* --- Typing effect --- */
+.typing {
+    color: var(--text-secondary);
+    padding: 10px 30px;
+    font-style: italic;
+    animation: pulse 1.5s infinite ease-in-out;
+}
+@keyframes pulse {
+    0% { opacity: 0.5; }
+    50% { opacity: 1; }
+    100% { opacity: 0.5; }
+}
+/* --- Chat Footer & Form --- */
+.chat-footer {
+    padding: 15px 30px;
+    border-top: 1px solid var(--border-color);
+    background-color: var(--bg-secondary);
+}
+.chat-form {
+    display: flex;
+    align-items: center;
+    gap: 10px;
+    background-color: var(--bg-primary);
+    border-radius: 8px;
+    padding: 5px;
+    border: 1px solid var(--border-color);
+}
+.input-wrapper {
+    flex-grow: 1;
+    position: relative;
+    display: flex;
+    align-items: center;
+}
+.input-wrapper input[type="text"] {
+    width: 100%;
+    background: none;
+    border: none;
+    outline: none;
+    color: var(--text-primary);
+    font-size: 1rem;
+    padding: 10px;
+}
+.input-wrapper input[type="text"]:disabled {
+    cursor: not-allowed;
+    color: var(--text-secondary);
+}
+.icon-btn, .send-btn {
+    background: none;
+    border: none;
+    color: var(--text-secondary);
+    cursor: pointer;
+    padding: 8px;
+    border-radius: 6px;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    transition: background-color 0.2s, color 0.2s;
+}
+.icon-btn:hover {
+    background-color: var(--bg-tertiary);
+    color: var(--text-primary);
+}
+.send-btn {
+    background-color: var(--accent-color);
+    color: var(--bg-primary);
+    border-radius: 50%;
+    width: 38px;
+    height: 38px;
+    transition: background-color 0.2s;
+}
+.send-btn:hover {
+    background-color: #79c0ff;
+}
+.send-btn svg {
+    width: 20px;
+    height: 20px;
+    transform: translateX(1px);
+}
+/* --- File Card --- */
+#fileCard {
+    display: none; /* Hidden by default */
+    align-items: center;
+    gap: 8px;
+    background-color: var(--bg-tertiary);
+    padding: 5px 10px;
+    border-radius: 6px;
+    margin-left: 10px;
+    font-size: 0.9rem;
+    position: absolute;
+}
+#fileName {
+    max-width: 200px;
+    white-space: nowrap;
+    overflow: hidden;
+    text-overflow: ellipsis;
+}
+#removeFile {
+    background: none;
+    border: none;
+    color: var(--text-secondary);
+    cursor: pointer;
+    font-size: 1.2rem;
+    line-height: 1;
+}
+#removeFile:hover {
+    color: var(--danger-color);
+}
+/* --- Scrollbar --- */
+.chat-body::-webkit-scrollbar {
+    width: 8px;
+}
+.chat-body::-webkit-scrollbar-track {
+    background: var(--bg-secondary);
+}
+.chat-body::-webkit-scrollbar-thumb {
+    background-color: var(--bg-tertiary);
+    border-radius: 10px;
+    border: 2px solid var(--bg-secondary);
+}

Demo/index.html ADDED Viewed

	@@ -0,0 +1,63 @@

+<!DOCTYPE html>
+<html lang="vi">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>PDF Chatbot</title>
+    <link rel="stylesheet" href="Assets/style.css">
+    <script src="https://kit.fontawesome.com/a076d05399.js" crossorigin="anonymous"></script> </head>
+<body>
+    <div class="app-container">
+        <aside class="sidebar">
+            <div class="sidebar-header">
+                <h3>Menu</h3>
+            </div>
+            <nav class="menu">
+                <ul>
+                    <li class="active"><a href="#">PDF Assistant</a></li>
+                    <li><a href="#">⚙️ Cài đặt</a></li>
+                    <li><a href="#">❓ Trợ giúp</a></li>
+                </ul>
+            </nav>
+        </aside>
+        <main class="chat-container">
+            <header class="chat-header">
+                <h2>PDF Assistant</h2>
+                <p>Tải lên tài liệu của bạn để tóm tắt và phân loại.</p>
+            </header>
+            <div id="chatBody" class="chat-body">
+                </div>
+            <footer class="chat-footer">
+                <form id="chatForm" class="chat-form">
+                    <label for="fileUpload" class="icon-btn" title="Đính kèm file PDF">
+                        <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><line x1="12" y1="5" x2="12" y2="19"></line><line x1="5" y1="12" x2="19" y2="12"></line></svg>
+                    </label>
+                    <input type="file" id="fileUpload" accept=".pdf" hidden>
+                    <button type="button" class="icon-btn" title="Công cụ (Tóm tắt, Hỏi đáp)">
+                        <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M14.7 6.3a1 1 0 0 0 0 1.4l1.6 1.6a1 1 0 0 0 1.4 0l3.77-3.77a6 6 0 0 1-7.94 7.94l-6.91 6.91a2.12 2.12 0 0 1-3-3l6.91-6.91a6 6 0 0 1 7.94-7.94l-3.76 3.76z"></path></svg>
+                    </button>
+                    <div class="input-wrapper">
+                        <div id="fileCard" class="file-card">
+                            <span class="file-icon">📄</span>
+                            <span id="fileName"></span>
+                            <button type="button" id="removeFile" class="remove-file-btn">&times;</button>
+                        </div>
+                        <input type="text" placeholder="" disabled>
+                    </div>
+                    <button type="submit" class="send-btn" title="Gửi">
+                        <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="currentColor"><path d="M3.478 2.405a.75.75 0 00-.926.94l2.432 7.905H13.5a.75.75 0 010 1.5H4.984l-2.432 7.905a.75.75 0 00.926.94 60.519 60.519 0 0018.445-8.986.75.75 0 000-1.218A60.517 60.517 0 003.478 2.405z"></path></svg>
+                    </button>
+                </form>
+            </footer>
+        </main>
+    </div>
+    <script src="Assets/script.js"></script>
+</body>
+</html>

Dockerfile ADDED Viewed

	@@ -0,0 +1,39 @@

+# ---- Base image (CUDA-enabled; works on GPU runners). For CPU, Hugging Face will still run it. ----
+FROM pytorch/pytorch:2.3.1-cuda11.8-cudnn8-runtime
+# Avoid interactive tzdata prompts
+ENV DEBIAN_FRONTEND=noninteractive \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1
+# System deps (faiss-cpu works; for faiss-gpu you may switch below)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git git-lfs build-essential poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+# Enable Git LFS (Spaces uses it automatically, but good to ensure)
+RUN git lfs install
+# Workdir
+WORKDIR /app
+# Copy only requirement files first to leverage Docker layer caching
+COPY requirements.txt ./
+# Install Python deps
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the rest
+COPY . .
+# Default host/port for the server (7860 is the default app port for Docker Spaces; these ENV lines only set defaults)
+ENV HOST=0.0.0.0
+ENV PORT=7860
+# Optional envs (override in Space Secrets)
+ENV HF_TOKEN=""
+ENV API_SECRET=""
+# Start the server
+CMD ["/bin/bash", "start.sh"]

Environment/bruh.yml ADDED Viewed

	@@ -0,0 +1,108 @@

+# ======================================
+# Conda Environment for Document AI API
+# ======================================
+name: bruh
+channels:
+  - nvidia
+  - pytorch
+  - conda-forge
+  - defaults
+dependencies:
+  # --- Python base ---
+  - python=3.11
+  - pip
+  - setuptools
+  - wheel
+  # --- Core DS stack ---
+  - numpy
+  - pandas
+  - scipy
+  - scikit-learn
+  - matplotlib
+  - seaborn
+  - networkx
+  - sympy
+  - pillow
+  - statsmodels
+  - joblib
+  # --- GPU / Deep Learning ---
+  - pytorch=2.3.1
+  - torchvision=0.18.1
+  - torchaudio=2.3.1
+  - pytorch-cuda=12.1  # GPU CUDA 12.1 runtime (NVIDIA driver >= 530)
+  # --- System & Utility ---
+  - psutil
+  - pyyaml
+  - requests
+  - filelock
+  # --- Jupyter (optional; can be omitted for server deployments) ---
+  - jupyterlab
+  - ipython
+  - ipykernel
+  - ipywidgets
+  # --- NLP, Transformers, Search, Web ---
+  - pip:
+      # ===============================
+      # Transformers Ecosystem
+      # ===============================
+      - torch==2.3.1
+      - torchvision==0.18.1
+      - torchaudio==2.3.1
+      - transformers==4.44.2
+      - sentence-transformers==3.0.1
+      - tokenizers>=0.19.1
+      - huggingface-hub>=0.23.4
+      - safetensors>=0.4.3
+      - accelerate==0.31.0
+      - datasets>=2.19.0
+      - evaluate>=0.4.2
+      - sentencepiece>=0.2.0
+      - protobuf>=4.25.2
+      - nltk>=3.9
+      - rouge-score>=0.1.2
+      # ===============================
+      # Semantic Search / FAISS
+      # ===============================
+      - faiss-gpu==1.8.0
+      # ===============================
+      # PDF / Text Processing
+      # ===============================
+      - PyMuPDF==1.24.9
+      - underthesea==6.8.4
+      - regex>=2024.5.15
+      - tqdm
+      - openpyxl
+      - lxml
+      - beautifulsoup4
+      # ===============================
+      # Web Framework / API
+      # ===============================
+      - fastapi==0.115.0
+      - uvicorn[standard]==0.30.6
+      - starlette>=0.37.2
+      - flask
+      - flask-cors
+      # ===============================
+      # Hugging Face / Google / Utils
+      # ===============================
+      - google-generativeai
+      - fsspec
+      - cloudpathlib
+      - multiprocess
+      - dill
+      - typer
+      - rich
+      # --- Vietnamese NLP ---
+      - underthesea==6.8.4

Environment/env.yml ADDED Viewed

	@@ -0,0 +1,83 @@

+# conda env create -f env.yml
+name: bruh
+channels:
+  - conda-forge
+  - pytorch
+  - nvidia
+  - defaults
+dependencies:
+  # --- Nền tảng & DS ---
+  - python=3.11
+  - pip
+  - numpy
+  - pandas
+  - scipy
+  - matplotlib
+  - seaborn
+  - scikit-learn
+  - statsmodels
+  - joblib
+  - networkx
+  - sympy
+  - pillow
+  # --- Deep Learning / GPU ---
+  - pytorch
+  - torchvision
+  - torchaudio
+  - pytorch-cuda=12.1
+  # --- Notebook & Jupyter ---
+  - jupyterlab
+  - ipython
+  - ipykernel
+  - ipywidgets
+  # --- Tiện ích hệ thống ---
+  - psutil
+  - pyyaml
+  - filelock
+  - requests
+  # --- Các gói bổ sung cài đặt bằng Pip ---
+  - pip:
+    - transformers==4.41.2
+    - datasets
+    - evaluate
+    - accelerate==0.31.0
+    - sentence-transformers
+    - spacy
+    - faiss-cpu
+    - google-generativeai
+    - beautifulsoup4
+    - tqdm
+    - openpyxl
+    - requests
+    - lxml
+    - rouge-score
+    - sentencepiece
+    - protobuf
+    - ipywidgets
+    - PyMuPDF
+    - fastapi
+    - uvicorn[standard]
+    # --- NLP & HuggingFace ecosystem ---
+    - huggingface-hub
+    - safetensors
+    - tokenizers
+    - nltk
+    # --- Web / API phụ trợ ---
+    - starlette
+    - typer
+    - rich
+    # --- Tiện ích khác ---
+    - regex
+    - multiprocess
+    - dill
+    - fsspec
+    - cloudpathlib

Libraries/Common_MyUtils.py ADDED Viewed

	@@ -0,0 +1,273 @@

+import logging
+import re, os
+import pandas as pd
+import json, csv, openpyxl
+from typing import Dict, List, Any, Tuple
+from collections import Counter
+# ===============================
+# 0. ERROR CATCHER
+# ===============================
def exc(func, fallback=None):
    """Call ``func()`` and shield the caller from any ``Exception``.

    Args:
        func: Zero-argument callable to execute.
        fallback: Value handed back when ``func`` raises.

    Returns:
        The result of ``func()``; if it raises an ``Exception`` the error is
        logged at WARNING level and ``fallback`` is returned instead.
    """
    try:
        outcome = func()
    except Exception as err:
        logging.warning(err)
        outcome = fallback
    return outcome
+# ===============================
+# 1. JSON
+# ===============================
def read_json(path: str) -> Any:
    """Load a UTF-8 encoded JSON file.

    Returns:
        The decoded document, or an empty list when ``path`` does not exist.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    return []
def write_json(data: Any, path: str, indent: int = 2) -> None:
    """Serialize ``data`` as UTF-8 JSON at ``path``, replacing any existing file.

    Missing parent directories are created first. Non-ASCII characters are
    written verbatim (``ensure_ascii=False``).
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as handle:
        json.dump(data, handle, ensure_ascii=False, indent=indent)
def insert_json(data: Any, path: str, indent: int = 2):
    """Append the JSON serialization of ``data`` to the end of ``path``.

    Missing parent directories are created first. The text is appended as-is
    with no separator, so calling this repeatedly on the same file yields
    several JSON documents back to back rather than one valid document.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "a", encoding="utf-8") as handle:
        json.dump(data, handle, ensure_ascii=False, indent=indent)
+# ===============================
+# 2. JSONL
+# ===============================
def read_jsonl(path: str) -> List[dict]:
    """Read a JSON Lines file into a list of decoded records.

    Blank or whitespace-only lines (for example a trailing empty line or a
    spacer between records) are skipped instead of raising a decode error.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]
def write_jsonl(data: List[dict], path: str) -> None:
    """Write ``data`` to ``path`` as JSON Lines, one record per line.

    Missing parent directories are created and an existing file is
    overwritten. Non-ASCII characters are written verbatim.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )
def insert_jsonl(data: List[dict], path: str):
    """Append every record in ``data`` to the JSON Lines file at ``path``.

    Missing parent directories are created; the file is created when absent.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "a", encoding="utf-8") as handle:
        for record in data:
            serialized = json.dumps(record, ensure_ascii=False)
            handle.write(serialized + "\n")
+# ===============================
+# 3. CSV
+# ===============================
def read_csv(path: str) -> List[dict]:
    """Read a UTF-8 CSV file and return one dict per data row.

    The first row supplies the keys; all values are returned as strings.
    """
    with open(path, "r", encoding="utf-8", newline="") as handle:
        reader = csv.DictReader(handle)
        return [row for row in reader]
def write_csv(data: List[dict], path: str) -> None:
    """Write ``data`` to ``path`` as UTF-8 CSV with a header row.

    The header is taken from the keys of the first record. Missing parent
    directories are always created, but no file is written when ``data`` is
    empty.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    if data:
        with open(path, "w", encoding="utf-8", newline="") as handle:
            writer = csv.DictWriter(handle, fieldnames=data[0].keys())
            writer.writeheader()
            for record in data:
                writer.writerow(record)
+# ===============================
+# 4.XLSX
+# ===============================
def read_xlsx(path: str, sheet_name: str = None) -> List[dict]:
    """Read a worksheet into a list of dicts keyed by the first row.

    Args:
        path: Workbook to open.
        sheet_name: Worksheet to read; the active sheet is used when omitted.

    Returns:
        One dict per row after the header row. An empty worksheet (such as
        the one ``write_xlsx`` saves for empty data) yields an empty list
        instead of failing on the missing header row.
    """
    wb = openpyxl.load_workbook(path)
    sheet = wb[sheet_name] if sheet_name else wb.active
    rows = list(sheet.values)
    if not rows:
        # No header row at all: nothing to key the records by.
        return []
    headers, *body = rows
    return [dict(zip(headers, row)) for row in body]
def write_xlsx(data: List[dict], path: str, sheet_name: str = "Sheet1") -> None:
    """Write ``data`` to an XLSX workbook with a single named worksheet.

    The header row is taken from the keys of the first record. Each following
    row is written in header order, so records whose keys are ordered
    differently still land in the right columns; a key missing from a record
    produces an empty cell.

    Missing parent directories are created. When ``data`` is empty an empty
    workbook is still saved.
    """
    dir_path = os.path.dirname(path)
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = sheet_name
    if data:
        headers = list(data[0].keys())
        ws.append(headers)
        for row in data:
            # Look values up by header instead of trusting each dict's own
            # key order, which would misalign columns.
            ws.append([row.get(header) for header in headers])
    wb.save(path)
def convert_to_xlsx(json_path, xlsx_path):
    """Convert a JSON file (list of objects) or a JSONL file to XLSX.

    Only the columns category, sub_category, url, title, description,
    content, date and words are exported, in that order, and only those that
    exist in the input.

    A missing input file or unparsable content is reported on stdout rather
    than raised.

    Args:
        json_path: Source file; a ``.jsonl`` suffix selects line-delimited mode.
        xlsx_path: Destination workbook. Its parent directory is created when
            the path has one; a bare file name is written to the current
            directory.
    """
    out_dir = os.path.dirname(xlsx_path)
    if out_dir:
        # os.makedirs("") raises, so only create a directory when one is named.
        os.makedirs(out_dir, exist_ok=True)
    try:
        if json_path.endswith('.jsonl'):
            df = pd.read_json(json_path, lines=True)
        else:
            df = pd.read_json(json_path)
        column_order = ["category", "sub_category", "url", "title", "description", "content", "date", "words"]
        df = df[[col for col in column_order if col in df.columns]]
        df.to_excel(xlsx_path, index=False, engine='openpyxl')
        print(f"-> Đã xuất thành công file Excel tại {xlsx_path}")
    except (FileNotFoundError, ValueError) as e:
        print(f"-> Không có dữ liệu hoặc lỗi khi chuyển sang Excel: {e}")
+# ===============================
+# 5. Convert
+# ===============================
def json_convert(data: Any, pretty: bool = True) -> str:
    """Return ``data`` as a JSON string, indented by two spaces when ``pretty``."""
    indent = 2 if pretty else None
    return json.dumps(data, indent=indent, ensure_ascii=False)
def jsonl_convert(data: List[dict]) -> str:
    """Return ``data`` as JSON Lines text (no trailing newline)."""
    serialized = [json.dumps(record, ensure_ascii=False) for record in data]
    return "\n".join(serialized)
+# ===============================
+# 6. Sort
+# ===============================
def sort_records(data: List[dict], keys: List[str]) -> List[dict]:
    """Return ``data`` sorted by several keys, leftmost key most significant.

    Records where a key is missing or ``None`` sort after records that have a
    value for it, instead of raising ``TypeError`` when ``None`` is compared
    with a real value. The sort is stable.
    """
    def _sort_key(record: dict) -> tuple:
        # (is_missing, value): the flag decides first, so None is never
        # compared against a real value.
        return tuple((record.get(k) is None, record.get(k)) for k in keys)

    return sorted(data, key=_sort_key)
+# ===============================
+# 7. Most Common
+# ===============================
def most_common(values):
    """Return the most frequent element of ``values``, or ``None`` when empty."""
    if values:
        (winner, _count), = Counter(values).most_common(1)
        return winner
    return None
+DEFAULT_NON_KEEP_PATTERN = re.compile(r"[^\w\s\(\)\.\,\;\:\-–]", flags=re.UNICODE)
+def preprocess_text(
+    text: Any,
+    non_keep_pattern: re.Pattern = DEFAULT_NON_KEEP_PATTERN,
+    max_chars_per_text: int | None = None,
+) -> Any:
+    """
+    Làm sạch chuỗi: strip, bỏ ký tự không mong muốn, rút gọn khoảng trắng.
+    Vẫn cho phép list/dict đi qua để hàm preprocess_data xử lý đệ quy.
+    """
+    if isinstance(text, list):
+        # Truyền tiếp đủ tham số khi gọi đệ quy
+        return [preprocess_text(t, non_keep_pattern=non_keep_pattern, max_chars_per_text=max_chars_per_text) for t in text]
+    if isinstance(text, str):
+        s = text.strip()  # <-- sửa từ s = strip()
+        s = non_keep_pattern.sub("", s)
+        s = re.sub(r"[ ]{2,}", " ", s)
+        if max_chars_per_text is not None and len(s) > max_chars_per_text:
+            s = s[: max_chars_per_text]
+        return s
+    return text
def preprocess_data(
    data: Any,
    non_keep_pattern: re.Pattern = DEFAULT_NON_KEEP_PATTERN,
    max_chars_per_text: int | None = None,
) -> Any:
    """Apply ``preprocess_text`` to every leaf of a nested JSON-like value.

    Dicts keep their keys and lists keep their order; only the leaf values
    are passed through ``preprocess_text`` with the given options.
    """
    def _clean(node: Any) -> Any:
        if isinstance(node, dict):
            return {name: _clean(child) for name, child in node.items()}
        if isinstance(node, list):
            return [_clean(child) for child in node]
        return preprocess_text(
            node,
            non_keep_pattern=non_keep_pattern,
            max_chars_per_text=max_chars_per_text,
        )

    return _clean(data)
+# ===============================
+# 9. Json
+# ===============================
def flatten_json(
    data: Any,
    prefix: str = "",
    flatten_mode: str = "split",
    join_sep: str = "\n",
) -> Dict[str, Any]:
    """Flatten a nested JSON-like value into a single-level dict.

    Dict keys are joined with "." and list handling depends on
    ``flatten_mode``:

    - "split": every element gets its own key (``a.b[0]``, ``a.b[1]``, ...);
      nested dicts/lists inside elements are flattened further.
    - "join": the list becomes one string; each element is converted with
      ``str()`` and stripped, empty results are dropped, the rest are joined
      with ``join_sep``.
    - any other value ("keep"): the list is stored unchanged under one key.

    Args:
        data: Value to flatten.
        prefix: Key prefix for every entry; trailing dots are removed.
        flatten_mode: List strategy described above.
        join_sep: Separator used in "join" mode.

    Returns:
        Mapping from flattened key to leaf value, in traversal order.
    """
    def _walk(node: Any, path: str):
        if isinstance(node, dict):
            for name, child in node.items():
                yield from _walk(child, f"{path}.{name}" if path else f"{name}")
        elif isinstance(node, list) and flatten_mode == "split":
            for position, child in enumerate(node):
                yield from _walk(child, f"{path}[{position}]")
        elif isinstance(node, list) and flatten_mode == "join":
            pieces = (str(element).strip() for element in node)
            yield path, join_sep.join(piece for piece in pieces if piece)
        else:
            # Scalars, None, and lists in "keep" mode are stored as-is.
            yield path, node

    return dict(_walk(data, prefix.rstrip(".")))
def deduplicates_by_key(pairs: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Drop repeated texts that share the same base key.

    The base key is the key with every ``[<digits>]`` index removed, so
    ``a[0]`` and ``a[1]`` belong to the same group. Texts are stripped before
    comparison; empty texts are discarded. The first occurrence of each
    (base key, text) combination is kept, later ones are removed.

    Args:
        pairs: ``(key, text)`` tuples, typically produced by flattening.

    Returns:
        The surviving ``(key, stripped_text)`` tuples in their original order.
    """
    texts_by_base: Dict[str, set] = {}
    survivors: List[Tuple[str, str]] = []
    for key, text in pairs:
        cleaned = text.strip()
        if not cleaned:
            continue
        bucket = texts_by_base.setdefault(re.sub(r"\[\d+\]", "", key), set())
        if cleaned not in bucket:
            bucket.add(cleaned)
            survivors.append((key, cleaned))
    return survivors

Libraries/Common_PdfProcess.py ADDED Viewed

	@@ -0,0 +1,152 @@

+from . import Common_MyUtils as MyUtils
+# ===============================
+# 1. General
+# ===============================
def fontFlags(span):
    """Decode three style bits from ``span["flags"]`` (0 when absent).

    Returns:
        A tuple of booleans for the bit masks 16, 2 and 8, which the callers
        treat as (bold, italic, underline).

    NOTE(review): if ``span`` comes from PyMuPDF's text dict, its documented
    font flags use bit 16 for bold and bit 2 for italic, but bit 8 means
    "monospaced", not underline. Confirm what the third value should be.
    """
    flags = span.get("flags", 0)
    return tuple(bool(flags & mask) for mask in (16, 2, 8))
def setAlign(position, regionWidth):
    """Classify a line's alignment from its offsets within the text region.

    A tolerance of 1% of ``regionWidth`` is used throughout.

    Args:
        position: Mapping with "Mid" (signed offset of the line centre from
            the region centre) and "Left" (gap to the region's left edge).
        regionWidth: Width of the text region.

    Returns:
        "Center" or "Justify" when the line is centred within tolerance
        (Center if it is also indented from the left edge), otherwise
        "Right" for a positive centre offset and "Left" for a negative one.
    """
    tolerance = 0.01 * regionWidth
    centre_offset = position["Mid"]
    left_gap = position["Left"]
    if abs(centre_offset) <= tolerance:
        return "Center" if left_gap > tolerance else "Justify"
    return "Right" if centre_offset > tolerance else "Left"
def setPosition(line, prev_line, next_line, xStart, xEnd, xMid):
    """Compute a line's offsets relative to the region and its neighbours.

    Args:
        line: Mapping whose "Coords" holds X0, X1, XM and Y1.
        prev_line: Preceding line (same layout) or a falsy value.
        next_line: Following line (same layout) or a falsy value.
        xStart, xEnd, xMid: Left edge, right edge and centre of the region.

    Returns:
        ``(left, right, mid, top, bot)`` rounded to one decimal: gaps to the
        left and right edges, signed centre offset, and Y1 distance to the
        previous and next line (0 when that neighbour is missing).
    """
    box = line["Coords"]
    left = round(box["X0"] - xStart, 1)
    right = round(xEnd - box["X1"], 1)
    mid = round(box["XM"] - xMid, 1)
    top = 0
    if prev_line:
        top = round(box["Y1"] - prev_line["Coords"]["Y1"], 1)
    bot = 0
    if next_line:
        bot = round(next_line["Coords"]["Y1"] - box["Y1"], 1)
    return (left, right, mid, top, bot)
+# ===============================
+# 2. Words
+# ===============================
def extractWords(line):
    """Split a line into ``(word, span)`` pairs in reading order.

    Words are the whitespace-separated tokens of each span's text, so
    punctuation stays attached. Spans with only whitespace are ignored
    unless every span is like that.

    Returns:
        A list of ``(word, span)`` tuples; empty when the line has no spans
        or its "text" is blank.
    """
    spans = line.get("spans", [])
    if not spans or not line.get("text", "").strip():
        return []
    # Prefer spans that carry visible text; fall back to all spans otherwise.
    candidates = [s for s in spans if s.get("text", "").strip()] or spans
    return [
        (token, span)
        for span in candidates
        for token in span.get("text", "").split()
    ]
def getWordText(line, index: int):
    """Return the text of the word at ``index`` (negative indices allowed).

    Returns an empty string when ``index`` is out of range.
    """
    words = extractWords(line)
    count = len(words)
    if not (-count <= index < count):
        return ""
    word, _span = words[index]
    return word
def getWordFontSize(line, index: int):
    """Return the font size of the word at ``index``, rounded to 0.1.

    The size is read from the span containing the word (12.0 when the span
    has no "size"). Returns 0.0 when ``index`` is out of range.
    """
    words = extractWords(line)
    count = len(words)
    if not (-count <= index < count):
        return 0.0
    span = words[index][1]
    return round(span.get("size", 12.0), 1)
def getWordCoord(line, index: int):
    """Return ``(x0, x1, y0, y1)`` for the word at ``index``.

    The values come from the "bbox" of the span that contains the word (not
    from the word itself) and are rounded to one decimal. Returns
    ``(0, 0, 0, 0)`` when ``index`` is out of range.
    """
    words = extractWords(line)
    count = len(words)
    if not (-count <= index < count):
        return (0, 0, 0, 0)
    span = words[index][1]
    left, top, right, bottom = span["bbox"]
    return (round(left, 1), round(right, 1), round(top, 1), round(bottom, 1))
+# ===============================
+# 3. Lines
+# ===============================
def getLineFontSize(line):
    """Return the line's font size: the mean over its words, to the nearest 0.5.

    Each word contributes the "size" of its span (12.0 when absent).
    Returns 12.0 for a line without words.
    """
    sizes = [span.get("size", 12.0) for _word, span in extractWords(line)]
    if not sizes:
        return 12.0
    mean_size = sum(sizes) / len(sizes)
    return round(mean_size * 2) / 2
def getLineCoord(line):
    """Return ``(x0, x1, xm, y0, y1)`` for a line, built from its words' spans.

    - x0: left edge of the first word's span
    - x1: right edge of the last word's span
    - xm: midpoint of x0 and x1, rounded to 0.1
    - y0: smallest top edge over all words
    - y1: largest bottom edge over all words

    Span coordinates are rounded to 0.1 first. Returns ``(0, 0, 0, 0, 0)``
    for a line without words.
    """
    words = extractWords(line)
    if not words:
        return (0, 0, 0, 0, 0)
    boxes = []
    for _word, span in words:
        left, top, right, bottom = span["bbox"]
        boxes.append((round(left, 1), round(top, 1), round(right, 1), round(bottom, 1)))
    lefts, tops, rights, bottoms = zip(*boxes)
    x0 = lefts[0]
    x1 = rights[-1]
    return (x0, x1, round((x0 + x1) / 2, 1), min(tops), max(bottoms))
def setLineSize(line):
    """Return ``(width, height)`` of a line from its "Coords", rounded to 0.1."""
    box = line["Coords"]
    left, right, top, bottom = box["X0"], box["X1"], box["Y0"], box["Y1"]
    width = round(right - left, 1)
    height = round(bottom - top, 1)
    return (width, height)
+# ===============================
+# 4. Page
+# ===============================
def setPageCoords(lines, pageGeneralSize):
    """Estimate the text region of a page from its lines.

    - xStart: most frequent left edge (X0) among the lines.
    - xEnd: most frequent right edge (X1) among lines reaching at least 75%
      of the page width, or the largest X1 when no line reaches that far.
    - yStart / yEnd: smallest Y0 and largest Y1.
    - xMid / yMid: midpoints, rounded to 0.1.

    All line coordinates are rounded to 0.1 before use.

    NOTE(review): element 1 of ``pageGeneralSize`` is used as the page width;
    confirm the tuple layout against the caller. ``lines`` must be non-empty.

    Returns:
        ``(xStart, yStart, xEnd, yEnd, xMid, yMid)``.
    """
    def _rounded(axis):
        return [round(entry["Coords"][axis], 1) for entry in lines]

    lefts, rights, tops, bottoms = (_rounded(axis) for axis in ("X0", "X1", "Y0", "Y1"))
    xStart = MyUtils.most_common(lefts)
    cutoff = pageGeneralSize[1] * 0.75
    far_right = [x for x in rights if x >= cutoff]
    xEnd = MyUtils.most_common(far_right) if far_right else max(rights)
    yStart, yEnd = min(tops), max(bottoms)
    xMid = round((xStart + xEnd) / 2, 1)
    yMid = round((yStart + yEnd) / 2, 1)
    return (xStart, yStart, xEnd, yEnd, xMid, yMid)
def setPageRegionSize(xStart, yStart, xEnd, yEnd):
    """Return ``(width, height)`` of the region, each rounded to 0.1."""
    width = xEnd - xStart
    height = yEnd - yStart
    return (round(width, 1), round(height, 1))

Libraries/Common_TextProcess.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import re
+from difflib import SequenceMatcher
+from . import Common_MyUtils as MyUtils
+ex = MyUtils.exc
+# ===============================
+# 1. Abbreviation
+# ===============================
+# Phụ âm đầu
+VALID_ONSETS = [
+    "b", "c", "ch", "d", "đ", "g", "gh", "gi",
+    "h", "k", "kh", "l", "m", "n", "ng", "ngh",
+    "nh", "p", "ph", "q", "r", "s", "t", "th",
+    "tr", "v", "x"
+]
+# Nguyên âm
+VALID_NUCLEI = [
+    "a", "ă", "â", "e", "ê", "i", "o", "ô", "ơ", "u", "ư", "y",
+    "ia", "iê", "ya", "ya", "ua", "uô", "ưa", "ươ",
+    "ai", "ao", "au", "ay", "âu", "ây",
+    "eo", "êu",
+    "ia", "iê", "yê",
+    "oi", "ôi", "ơi",
+    "ua", "uô", "ươ", "ưu", "uy", "uya"
+]
+# Phụ âm cuối
+VALID_CODAS = ["c", "ch", "m", "n", "ng", "nh", "p", "t"]
+# ===== Hàm kiểm tra viết tắt =====
+def is_abbreviation(word: str) -> bool:
+    """
+    Trả về True nếu từ KHÔNG phải âm tiết tiếng Việt chuẩn,
+    tức là có khả năng là viết tắt.
+    Quy tắc:
+    1. Không có nguyên âm hoặc nguyên âm không hợp lệ -> viết tắt
+    2. Phụ âm đầu không hợp lệ -> viết tắt
+    3. Phụ âm cuối không hợp lệ -> viết tắt
+    4. Nhiều hơn 3 phần (đầu - nguyên âm - cuối) -> viết tắt
+    """
+    w = word.lower()
+    w = re.sub(r'[^a-zăâêôơưđ]', '', w)
+    if not w:
+        return True
+    # 1. Tìm phụ âm đầu
+    onset = None
+    for o in sorted(VALID_ONSETS, key=len, reverse=True):
+        if w.startswith(o):
+            onset = o
+            break
+    rest = w[len(onset):] if onset else w
+    if onset is None and rest and rest[0] not in "aeiouyăâêôơư":
+        return True  # phụ âm đầu không hợp lệ
+    # 2. Tìm phụ âm cuối
+    coda = None
+    for c in sorted(VALID_CODAS, key=len, reverse=True):
+        if rest.endswith(c):
+            coda = c
+            break
+    nucleus = rest[:-len(coda)] if coda else rest
+    # 3. Kiểm tra nguyên âm
+    if not nucleus:
+        return True
+    if nucleus not in VALID_NUCLEI:
+        return True
+    # 4. Kiểm tra số phần
+    parts = [p for p in [onset, nucleus, coda] if p]
+    if len(parts) > 3:
+        return True
+    return False
+# ===============================
+# 2. Words
+# ===============================
+# ===== Hàm chuẩn hóa từ ======================
def normalize_word(w: str) -> str:
    """Remove every character that is not an ASCII letter, a digit, or in the
    ``À-ỹ`` / ``Đ`` / ``đ`` ranges (punctuation, spaces, symbols)."""
    unwanted = re.compile(r'[^A-Za-zÀ-ỹĐđ0-9]')
    return unwanted.sub('', w)
+# ===== Hàm so sánh độ tương đồng =============
def similar(a, b):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of ``a`` and ``b``
    (a float between 0 and 1)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
+# ===== Hàm chuyển số La Mã ===================
def is_roman(s):
    """Return True when ``s`` is non-empty and made only of the uppercase
    letters I, V, X, L, C.

    This checks the alphabet only, not whether the numeral is well formed.
    """
    return re.fullmatch(r'[IVXLC]+', s) is not None
+# ===== Chuyển số La Mã sang số Ả Rập =========
def roman_to_int(s):
    """Convert a Roman numeral made of I, V, X, L, C to an integer.

    The string is scanned right to left: a symbol smaller than the largest
    value added so far is subtracted, otherwise it is added. Characters
    outside the table count as 0.
    """
    symbol_values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100}
    total = 0
    peak = 0
    for symbol in reversed(s):
        worth = symbol_values.get(symbol, 0)
        if worth >= peak:
            total += worth
            peak = worth
        else:
            total -= worth
    return total
+# ===== Hàm loại bỏ khoảng trắng thừa =========
def strip_extra_spaces(s: str) -> str:
    """Collapse every whitespace run to one space and trim both ends.

    Non-string input is returned unchanged.
    """
    if isinstance(s, str):
        collapsed = re.sub(r'\s+', ' ', s)
        return collapsed.strip()
    return s
def merge_txt(RawDataDict, JsonKey, JsonField):
    """Join one field of every entry under ``JsonKey`` into newline-separated text.

    Entries whose ``JsonField`` is missing or falsy are skipped; each value
    is stripped, the pieces are joined with newlines, and runs of blank
    lines are collapsed to a single newline.
    """
    pieces = []
    for paragraph in RawDataDict.get(JsonKey, []):
        if paragraph.get(JsonField):
            pieces.append(paragraph.get(JsonField, "").strip())
    joined = "\n".join(pieces).strip()
    return re.sub(r"\n{2,}", "\n", joined)

Libraries/Faiss_ChunkMapping.py ADDED Viewed

	@@ -0,0 +1,184 @@

+from typing import Dict, List, Any, Optional, Iterable
+# --------- A. Tiện ích cơ bản ---------
+def _ordered_unique_chunk_ids(reranked: List[Dict[str, Any]]) -> List[int]:
+    seen, ordered = set(), []
+    for r in reranked:
+        for cid in r.get("chunk_ids", []):
+            if isinstance(cid, (int, str)) and str(cid).isdigit():
+                cid = int(cid)
+                if cid not in seen:
+                    seen.add(cid)
+                    ordered.append(cid)
+    return ordered
+def _filter_fields_recursive(obj: Any, drop_lower: set) -> Any:
+    """Loại bỏ các field có tên xuất hiện trong drop_lower (case-insensitive) trên toàn cấu trúc."""
+    if isinstance(obj, dict):
+        return {
+            k: _filter_fields_recursive(v, drop_lower)
+            for k, v in obj.items()
+            if k.lower() not in drop_lower
+        }
+    if isinstance(obj, list):
+        return [_filter_fields_recursive(x, drop_lower) for x in obj]
+    return obj
+def _iter_values_no_keys(obj: Any) -> Iterable[str]:
+    """Duyệt đệ quy, chỉ yield GIÁ TRỊ (bỏ key), split theo '\n' nếu là chuỗi."""
+    if isinstance(obj, dict):
+        for v in obj.values():
+            yield from _iter_values_no_keys(v)
+    elif isinstance(obj, list):
+        for item in obj:
+            yield from _iter_values_no_keys(item)
+    elif isinstance(obj, str):
+        for line in obj.splitlines():
+            yield line
+    else:
+        yield str(obj)
+def _get_by_path(obj: Any, path: str) -> Any:
+    """
+    Lấy giá trị theo path kiểu 'A.B.C'.
+    - Nếu gặp list trong quá trình đi xuống → thu thập giá trị từ từng phần tử (map-collect).
+    - Nếu path không tồn tại → trả về None.
+    """
+    parts = path.split(".")
+    def _step(o, idx=0):
+        if idx == len(parts):
+            return o
+        key = parts[idx]
+        if isinstance(o, dict):
+            if key not in o:
+                return None
+            return _step(o[key], idx + 1)
+        if isinstance(o, list):
+            collected = []
+            for it in o:
+                collected.append(_step(it, idx))
+            # gộp phẳng các None
+            flat = []
+            for v in collected:
+                if v is None:
+                    continue
+                if isinstance(v, list):
+                    flat.extend(v)
+                else:
+                    flat.append(v)
+            return flat
+        return None
+    return _step(obj, 0)
+# --------- B. Các hàm chính ---------
def extract_chunks_from_rerank_flexible(
    reranked_results: List[Dict[str, Any]],
    SegmentDict: List[Dict[str, Any]],
    n_chunks: Optional[int] = None,
    drop_fields: Optional[List[str]] = None,
) -> List[Dict[str, Any]]:
    """Fetch the source chunks referenced by reranked results.

    Chunk ids are taken in reranked order without duplicates and are
    1-based positions into ``SegmentDict``. When ``n_chunks`` is given, only
    the first ``n_chunks`` ids are considered (the cut happens before ids
    outside ``1..len(SegmentDict)`` are discarded).

    Args:
        reranked_results: Results carrying a "chunk_ids" list each.
        SegmentDict: All chunks, in id order.
        n_chunks: Optional cap on the number of ids considered.
        drop_fields: Field names to remove (case-insensitive) at any depth.

    Returns:
        ``[{"chunk_id": int, "data": <chunk, filtered if requested>}, ...]``.
    """
    if not reranked_results:
        return []
    # Ids come back already unique and in rerank order.
    chunk_ids = _ordered_unique_chunk_ids(reranked_results)
    if n_chunks is not None:
        chunk_ids = chunk_ids[:int(n_chunks)]
    drop_lower = {name.lower() for name in (drop_fields or [])}
    total = len(SegmentDict)
    selected = []
    for cid in chunk_ids:
        if not 1 <= cid <= total:
            continue
        payload = SegmentDict[cid - 1]
        if drop_lower:
            payload = _filter_fields_recursive(payload, drop_lower)
        selected.append({"chunk_id": cid, "data": payload})
    return selected
def collect_chunk_text(chunks: List[Dict[str, Any]]) -> str:
    """Render chunks as plain text: values only, one line each.

    Every chunk's "data" is walked for its leaf values (dict keys are left
    out, multi-line strings are split) and a blank line separates chunks.
    A fixed placeholder message is returned when ``chunks`` is empty.
    """
    if not chunks:
        return "(Không có chunk nào)"
    lines: List[str] = []
    for chunk in chunks:
        lines.extend(_iter_values_no_keys(chunk["data"]))
        lines.append("")
    return "\n".join(lines).strip()
def extract_fields_for_each_chunk(
    chunks: List[Dict[str, Any]],
    fields: Optional[List[str]] = None,
) -> List[Dict[str, Any]]:
    """Pick selected fields out of every chunk.

    Args:
        chunks: Items shaped like ``{"chunk_id": ..., "data": ...}``.
        fields: Dotted paths (``"A.B.C"``) to extract. ``None`` selects all
            top-level fields of the chunk's data.

    Returns:
        One ``{"chunk_id": ..., "fields": ...}`` entry per chunk. When a
        chunk's data is not a dict it is passed through as the "fields"
        value unchanged.
    """
    results = []
    for chunk in chunks:
        data = chunk["data"]
        if not isinstance(data, dict):
            payload = data
        elif fields is None:
            payload = dict(data)
        else:
            payload = {name: _get_by_path(data, name) for name in fields}
        results.append({"chunk_id": chunk["chunk_id"], "fields": payload})
    return results
def process_chunks_pipeline(
    reranked_results: List[Dict[str, Any]],
    SegmentDict: List[Dict[str, Any]],
    drop_fields: Optional[List[str]] = None,
    fields: Optional[List[str]] = None,
    n_chunks: Optional[int] = None
) -> Dict[str, Any]:
    """Run the full chunk post-processing for a set of reranked results.

    Args:
        reranked_results: Reranked hits carrying "chunk_ids".
        SegmentDict: All source chunks, addressed by 1-based id.
        drop_fields: Field names removed everywhere in the chunk JSON.
        fields: Dotted paths to extract per chunk (``None`` = all top-level).
        n_chunks: Cap on the number of chunk ids used (``None`` = all).

    Returns:
        A dict with three views of the same selected chunks:
        "chunks_json" (chunk JSON after ``drop_fields``), "chunks_text"
        (their values as plain text) and "extracted_fields" (the requested
        fields per chunk).
    """
    selected = extract_chunks_from_rerank_flexible(
        reranked_results=reranked_results,
        SegmentDict=SegmentDict,
        n_chunks=n_chunks,
        drop_fields=drop_fields,
    )
    return {
        "chunks_json": selected,
        "chunks_text": collect_chunk_text(selected),
        "extracted_fields": extract_fields_for_each_chunk(selected, fields=fields),
    }

Libraries/Faiss_Embedding.py ADDED Viewed

	@@ -0,0 +1,288 @@

+import logging
+import re, os
+import torch
+import faiss
+import numpy as np
+from typing import Dict, List, Any, Tuple, Optional
+from . import Common_MyUtils as MyUtils
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+class DirectFaissIndexer:
+    """
+        1) FaissPath (.faiss): chỉ chứa vectors,
+        2) MapDataPath (.json): content + index,
+        3) MappingPath (.json): ánh xạ key <-> index.
+    """
+    def __init__(
+        self,
+        indexer: Any,
+        device: str = "cpu",
+        batch_size: int = 32,
+        show_progress: bool = False,
+        flatten_mode: str = "split",
+        join_sep: str = "\n",
+        allowed_schema_types: Tuple[str, ...] = ("string", "array", "dict"),
+        max_chars_per_text: Optional[int] = None,
+        normalize: bool = True,
+        verbose: bool = False,
+        list_policy: str = "split", # "merge" | "split"
+    ):
+        self.indexer = indexer
+        self.device = device
+        self.batch_size = batch_size
+        self.show_progress = show_progress
+        self.flatten_mode = flatten_mode
+        self.join_sep = join_sep
+        self.allowed_schema_types = allowed_schema_types
+        self.max_chars_per_text = max_chars_per_text
+        self.normalize = normalize
+        self.verbose = verbose
+        self.list_policy = list_policy
+        self._non_keep_pattern = re.compile(r"[^\w\s\(\)\.\,\;\:\-–]", flags=re.UNICODE)
+    # ---------- Schema & chọn trường ----------
+    @staticmethod
+    def _base_key_for_schema(key: str) -> str:
+        return re.sub(r"\[\d+\]", "", key)
+    def _eligible_by_schema(self, key: str, schema: Optional[Dict[str, str]]) -> bool:
+        if schema is None:
+            return True
+        base_key = self._base_key_for_schema(key)
+        typ = schema.get(base_key)
+        return (typ in self.allowed_schema_types) if typ is not None else False
+    # ---------- Tiền xử lý & flatten ----------
+    def _preprocess_data(self, data: Any) -> Any:
+        if MyUtils and hasattr(MyUtils, "preprocess_data"):
+            return MyUtils.preprocess_data(
+                data,
+                non_keep_pattern=self._non_keep_pattern,
+                max_chars_per_text=self.max_chars_per_text
+            )
+    def _flatten_json(self, data: Any) -> Dict[str, Any]:
+        """
+        Flatten JSON theo list_policy:
+        - merge: gộp list/dict chứa chuỗi thành 1 đoạn text duy nhất
+        - split: tách từng phần tử
+        """
+        # Nếu merge, xử lý JSON trước khi flatten
+        if self.list_policy == "merge":
+            def _merge_lists(obj):
+                if isinstance(obj, dict):
+                    return {k: _merge_lists(v) for k, v in obj.items()}
+                elif isinstance(obj, list):
+                    # Nếu list chỉ chứa chuỗi / số, gộp lại
+                    if all(isinstance(i, (str, int, float)) for i in obj):
+                        return self.join_sep.join(map(str, obj))
+                    # Nếu list chứa dict hoặc list lồng, đệ quy
+                    return [_merge_lists(v) for v in obj]
+                else:
+                    return obj
+            data = _merge_lists(data)
+        # Sau đó gọi MyUtils.flatten_json như cũ
+        return MyUtils.flatten_json(
+            data,
+            prefix="",
+            flatten_mode=self.flatten_mode,
+            join_sep=self.join_sep
+        )
+    # ---------- Encode (batch) với fallback OOM CPU ----------
+    def _encode_texts(self, texts: List[str]) -> torch.Tensor:
+        try:
+            embs = self.indexer.encode(
+                sentences=texts,
+                batch_size=self.batch_size,
+                convert_to_tensor=True,
+                device=self.device,
+                show_progress_bar=self.show_progress,
+            )
+            return embs
+        except RuntimeError as e:
+            if "CUDA out of memory" in str(e):
+                print("⚠️ CUDA OOM → fallback CPU.")
+                try:
+                    self.indexer.to("cpu")
+                except Exception:
+                    pass
+                embs = self.indexer.encode(
+                    sentences=texts,
+                    batch_size=self.batch_size,
+                    convert_to_tensor=True,
+                    device="cpu",
+                    show_progress_bar=self.show_progress,
+                )
+                return embs
+            raise
+    # ---------- Build FAISS ----------
+    @staticmethod
+    def _l2_normalize(mat: np.ndarray) -> np.ndarray:
+        norms = np.linalg.norm(mat, axis=1, keepdims=True)
+        norms[norms == 0.0] = 1.0
+        return mat / norms
+    def _create_faiss_index(self, matrix: np.ndarray) -> faiss.Index:
+        dim = int(matrix.shape[1])
+        index = faiss.IndexFlatIP(dim)
+        index.add(matrix.astype("float32"))
+        return index
+    # ================================================================
+    #  Hàm lọc trùng nhưng vẫn gom nhóm chunk tương ứng
+    # ================================================================
+    def deduplicates_with_mask(
+        self,
+        pairs: List[Tuple[str, str]],
+        chunk_map: List[int]
+    ) -> Tuple[List[Tuple[str, str]], List[List[int]]]:
+        assert len(pairs) == len(chunk_map), "pairs và chunk_map phải đồng dài"
+        seen_per_key: Dict[str, Dict[str, int]] = {}
+        # base_key -> text_norm -> index trong filtered_pairs
+        filtered_pairs: List[Tuple[str, str]] = []
+        chunk_groups: List[List[int]] = []  # song song với filtered_pairs
+        for (key, text), c in zip(pairs, chunk_map):
+            text_norm = text.strip()
+            if not text_norm:
+                continue
+            base_key = re.sub(r"\[\d+\]", "", key)
+            if base_key not in seen_per_key:
+                seen_per_key[base_key] = {}
+            # Nếu text đã xuất hiện → thêm chunk vào nhóm cũ
+            if text_norm in seen_per_key[base_key]:
+                idx = seen_per_key[base_key][text_norm]
+                if c not in chunk_groups[idx]:
+                    chunk_groups[idx].append(c)
+                continue
+            # Nếu chưa có → tạo mới
+            seen_per_key[base_key][text_norm] = len(filtered_pairs)
+            filtered_pairs.append((key, text_norm))
+            chunk_groups.append([c])
+        return filtered_pairs, chunk_groups
+    # ================================================================
+    #  Ghi ChunkMapping
+    # ================================================================
+    def write_chunk_mapping(self, MapChunkPath: str, SegmentPath: str, chunk_groups: List[List[int]]) -> None:
+        # Ghi chunk mapping dạng gọn: mỗi index một dòng
+        with open(MapChunkPath, "w", encoding="utf-8") as f:
+            f.write('{\n')
+            f.write('  "index_to_chunk": {\n')
+            items = list(enumerate(chunk_groups))
+            for i, (idx, group) in enumerate(items):
+                group_str = "[" + ", ".join(map(str, group)) + "]"
+                comma = "," if i < len(items) - 1 else ""
+                f.write(f'    "{idx}": {group_str}{comma}\n')
+            f.write('  },\n')
+            f.write('  "meta": {\n')
+            f.write(f'    "count": {len(chunk_groups)},\n')
+            f.write(f'    "source": "{os.path.basename(SegmentPath)}"\n')
+            f.write('  }\n')
+            f.write('}\n')
+    # ================================================================
+    #  Hàm build_from_json
+    # ================================================================
+    def build_from_json(
+        self,
+        SegmentPath: str,
+        SchemaDict: Optional[str],
+        FaissPath: str,
+        MapDataPath: str,
+        MappingPath: str,
+        MapChunkPath: Optional[str] = None,
+    ) -> None:
+        assert os.path.exists(SegmentPath), f"Không thấy file JSON: {SegmentPath}"
+        os.makedirs(os.path.dirname(FaissPath), exist_ok=True)
+        os.makedirs(os.path.dirname(MapDataPath), exist_ok=True)
+        os.makedirs(os.path.dirname(MappingPath), exist_ok=True)
+        if MapChunkPath:
+            os.makedirs(os.path.dirname(MapChunkPath), exist_ok=True)
+        schema = SchemaDict
+        # 1️⃣ Read JSON
+        data_obj = MyUtils.read_json(SegmentPath)
+        data_list = data_obj if isinstance(data_obj, list) else [data_obj]
+        # 2️⃣ Flatten + lưu chunk_id
+        pair_list: List[Tuple[str, str]] = []
+        chunk_map: List[int] = []
+        for chunk_id, item in enumerate(data_list, start=1):
+            processed = self._preprocess_data(item)
+            flat = self._flatten_json(processed)
+            for k, v in flat.items():
+                if not self._eligible_by_schema(k, schema):
+                    continue
+                if isinstance(v, str) and v.strip():
+                    pair_list.append((k, v.strip()))
+                    chunk_map.append(chunk_id)
+        if not pair_list:
+            raise ValueError("Không tìm thấy nội dung văn bản hợp lệ để encode.")
+        # 3️⃣ Loại trùng nhưng gom nhóm chunk
+        pair_list, chunk_groups = self.deduplicates_with_mask(pair_list, chunk_map)
+        # 4️⃣ Encode
+        keys  = [k for k, _ in pair_list]
+        texts = [t for _, t in pair_list]
+        embs_t = self._encode_texts(texts)
+        embs = embs_t.detach().cpu().numpy()
+        if self.normalize:
+            embs = self._l2_normalize(embs)
+        # 5️⃣ FAISS
+        index = self._create_faiss_index(embs)
+        faiss.write_index(index, FaissPath)
+        logging.info(f"✅ Đã xây FAISS: {FaissPath}")
+        # 6️⃣ Mapping + MapData
+        index_to_key = {str(i): k for i, k in enumerate(keys)}
+        Mapping = {
+            "meta": {
+                "count": len(keys),
+                "dim": int(embs.shape[1]),
+                "metric": "ip",
+                "normalized": bool(self.normalize),
+            },
+            "index_to_key": index_to_key,
+        }
+        MapData = {
+            "items": [{"index": i, "key": k, "text": t} for i, (k, t) in enumerate(pair_list)],
+            "meta": {
+                "count": len(keys),
+                "flatten_mode": self.flatten_mode,
+                "schema_used": schema is not None,
+                "list_policy": self.list_policy
+            }
+        }
+        self.write_chunk_mapping(MapChunkPath, SegmentPath, chunk_groups)
+        return Mapping, MapData

Libraries/Faiss_Searching.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import faiss
+import numpy as np
+from typing import Dict, List, Any, Optional
+from sentence_transformers import SentenceTransformer, CrossEncoder
+class SemanticSearchEngine:
+    def __init__(
+        self,
+        indexer: SentenceTransformer,
+        reranker: Optional[CrossEncoder] = None,
+        device: str = "cuda",
+        normalize: bool = True,
+        top_k: int = 20,
+        rerank_k: int = 10,
+        rerank_batch_size: int = 16,
+    ):
+        self.device = device
+        self.normalize = normalize
+        self.top_k = int(top_k)
+        self.rerank_k = int(rerank_k)
+        self.rerank_batch_size = int(rerank_batch_size)
+        # ✅ Nhận trực tiếp model đã load
+        if not isinstance(indexer, SentenceTransformer):
+            raise TypeError("indexer phải là SentenceTransformer đã load sẵn.")
+        self._indexer = indexer
+        # Reranker là tùy chọn
+        if reranker and not isinstance(reranker, CrossEncoder):
+            raise TypeError("reranker phải là CrossEncoder hoặc None.")
+        self.reranker = reranker
+    # ---------------------------
+    # Tiện ích nội bộ
+    # ---------------------------
+    @staticmethod
+    def _l2_normalize(x: np.ndarray, axis: int = 1, eps: float = 1e-12) -> np.ndarray:
+        denom = np.linalg.norm(x, axis=axis, keepdims=True)
+        denom = np.maximum(denom, eps)
+        return x / denom
+    @staticmethod
+    def _build_idx_maps(Mapping: Dict[str, Any], MapData: Dict[str, Any]):
+        """Tạo ánh xạ index→text và index→key"""
+        items = MapData.get("items", [])
+        idx2text = {int(item["index"]): item.get("text", None) for item in items}
+        raw_i2k = Mapping.get("index_to_key", {})
+        idx2key = {int(i): k for i, k in raw_i2k.items()}
+        return idx2text, idx2key
+    # ---------------------------
+    # 1️⃣ SEARCH: FAISS vector search
+    # ---------------------------
+    def search(
+        self,
+        query: str,
+        faissIndex: "faiss.Index",  # type: ignore
+        Mapping: Dict[str, Any],
+        MapData: Dict[str, Any],
+        MapChunk: Optional[Dict[str, Any]] = None,
+        top_k: Optional[int] = None,
+        query_embedding: Optional[np.ndarray] = None,
+    ) -> List[Dict[str, Any]]:
+        """
+        Trả về:
+            [{"index":..., "key":..., "text":..., "faiss_score":...}, ...]
+        """
+        k = int(top_k or self.top_k)
+        # 1. Encode truy vấn (hoặc dùng sẵn embedding)
+        if query_embedding is None:
+            q = self._indexer.encode(
+                [query], convert_to_tensor=True, device=str(self.device)
+            )
+            q = q.detach().cpu().numpy().astype("float32")
+        else:
+            q = np.asarray(query_embedding, dtype="float32")
+            if q.ndim == 1:
+                q = q[None, :]
+        # 2. Normalize nếu dùng cosine
+        if self.normalize:
+            q = self._l2_normalize(q)
+        # 3. Search FAISS
+        scores, ids = faissIndex.search(q, k)
+        idx2text, idx2key = self._build_idx_maps(Mapping, MapData)
+        # 4. Mapping kết quả
+        chunk_map = MapChunk.get("index_to_chunk", {}) if MapChunk else {}
+        results = []
+        for score, idx in zip(scores[0].tolist(), ids[0].tolist()):
+            chunk_ids = chunk_map.get(str(idx), [])
+            results.append({
+                "index": int(idx),
+                "key": idx2key.get(int(idx)),
+                "text": idx2text.get(int(idx)),
+                "faiss_score": float(score),
+                "chunk_ids": chunk_ids,
+            })
+        return results
+    # ---------------------------
+    # 2️⃣ RERANK: CrossEncoder rerank
+    # ---------------------------
+    def rerank(
+        self,
+        query: str,
+        results: List[Dict[str, Any]],
+        top_k: Optional[int] = None,
+        show_progress: bool = False,
+    ) -> List[Dict[str, Any]]:
+        """
+        Xếp hạng lại kết quả bằng CrossEncoder (nếu có).
+        Trả về danh sách top_k kết quả đã rerank.
+        """
+        if not results:
+            return []
+        if self.reranker is None:
+            raise ValueError("⚠️ Không có reranker được cung cấp khi khởi tạo.")
+        k = int(top_k or self.rerank_k)
+        pairs = []
+        valid_indices = []
+        for i, r in enumerate(results):
+            text = r.get("text")
+            if isinstance(text, str) and text.strip():
+                pairs.append([query, text])
+                valid_indices.append(i)
+        if not pairs:
+            return []
+        scores = self.reranker.predict(
+            pairs, batch_size=self.rerank_batch_size, show_progress_bar=show_progress
+        )
+        for i, s in zip(valid_indices, scores):
+            results[i]["rerank_score"] = float(s)
+        reranked = [r for r in results if "rerank_score" in r]
+        reranked.sort(key=lambda x: x["rerank_score"], reverse=True)
+        return reranked[:k]

Libraries/Json_ChunkMaster.py ADDED Viewed

	@@ -0,0 +1,91 @@

+from collections import OrderedDict
+from copy import deepcopy
+class ChunkBuilder:
+    def readInput(self, RawLvlsDict=None, RawDataDict=None):
+        # Đọc dữ liệu
+        self.struct_spec = RawLvlsDict[0]
+        self.paragraphs = sorted(
+            RawDataDict.get("paragraphs", []),
+            key=lambda x: x.get("Paragraph", 0)
+        )
+        # Chuẩn bị cấu trúc
+        self.ordered_fields = list(self.struct_spec.keys())
+        self.last_field = self.ordered_fields[-1]
+        self.level_fields = self.ordered_fields[:-1]
+        # Tập marker cho từng field
+        self.marker_dict = {}
+        for fld in self.ordered_fields:
+            vals = self.struct_spec.get(fld, [])
+            self.marker_dict[fld] = set(vals) if isinstance(vals, list) else set()
+        # Biến tạm
+        self.StructDict = []
+        self.index_counter = 1
+    # ===== Các hàm tiện ích =====
+    def _new_temp(self):
+        return {fld: "" for fld in self.level_fields} | {self.last_field: []}
+    def _temp_has_data(self, temp):
+        return any(temp[f].strip() for f in self.level_fields) or bool(temp[self.last_field])
+    def _reset_deeper(self, temp, touched_field):
+        idx = self.level_fields.index(touched_field)
+        for f in self.level_fields[idx+1:]:
+            temp[f] = ""
+        temp[self.last_field] = []
+    def _has_data_from_level(self, temp, fld):
+        """Kiểm tra từ level fld trở xuống có dữ liệu không"""
+        if fld not in self.level_fields:
+            return False
+        idx = self.level_fields.index(fld)
+        for f in self.level_fields[idx:]:
+            if temp[f].strip():
+                return True
+        if temp[self.last_field]:
+            return True
+        return False
+    def _with_index(self, temp, idx):
+        """Tạo OrderedDict với Index đứng đầu"""
+        od = OrderedDict()
+        od["Index"] = idx
+        for f in self.level_fields:
+            od[f] = temp[f]
+        od[self.last_field] = temp[self.last_field]
+        return od
+    # ===== Hàm chính =====
+    def build(self, RawLvlsDict=None, RawDataDict=None):
+        self.readInput(RawLvlsDict, RawDataDict)
+        temp = self._new_temp()
+        for p in self.paragraphs:
+            text = p.get("Text") or ""
+            marker = p.get("MarkerType", None) or "none"
+            matched_field = None
+            for fld in self.level_fields:
+                if marker in self.marker_dict.get(fld, set()):
+                    matched_field = fld
+                    break
+            if matched_field is not None:
+                if self._has_data_from_level(temp, matched_field):
+                    self.StructDict.append(self._with_index(deepcopy(temp), self.index_counter))
+                    self.index_counter += 1
+                temp[matched_field] = text
+                self._reset_deeper(temp, matched_field)
+            else:
+                temp[self.last_field].append(text)
+        if self._temp_has_data(temp):
+            self.StructDict.append(self._with_index(deepcopy(temp), self.index_counter))
+            self.index_counter += 1
+        return self.StructDict

Libraries/Json_ChunkUnder.py ADDED Viewed

	@@ -0,0 +1,141 @@

+import re
+import numpy as np
+from underthesea import sent_tokenize
+class ChunkUndertheseaBuilder:
+    """
+    Bộ tách văn bản tiếng Việt thông minh:
+      1️⃣ Lọc trước (Extractive): chỉ giữ các câu có ý chính
+      2️⃣ Gộp sau (Semantic): nhóm các câu trọng tâm theo ngữ nghĩa
+    """
+    def __init__(self,
+                 embedder,
+                 device: str = "cpu",
+                 min_words: int = 256,
+                 max_words: int = 768,
+                 sim_threshold: float = 0.7,
+                 key_sent_ratio: float = 0.4):
+        if embedder is None:
+            raise ValueError("❌ Cần truyền mô hình embedder đã load sẵn.")
+        self.embedder = embedder
+        self.device = device
+        self.min_words = min_words
+        self.max_words = max_words
+        self.sim_threshold = sim_threshold
+        self.key_sent_ratio = key_sent_ratio
+    # ============================================================
+    # 1️⃣ Tách câu
+    # ============================================================
+    def _split_sentences(self, text: str):
+        """Tách câu tiếng Việt (fallback nếu underthesea lỗi)."""
+        text = re.sub(r"[\x00-\x1f]+", " ", text)
+        try:
+            sents = sent_tokenize(text)
+        except Exception:
+            sents = re.split(r"(?<=[.!?])\s+", text)
+        return [s.strip() for s in sents if len(s.strip()) > 2]
+    # ============================================================
+    # 2️⃣ Encode an toàn (GPU/CPU fallback)
+    # ============================================================
+    def _encode(self, sentences):
+        try:
+            return self.embedder.encode(
+                sentences,
+                convert_to_numpy=True,
+                show_progress_bar=False,
+                device=str(self.device)
+            )
+        except TypeError:
+            return self.embedder.encode(sentences, convert_to_numpy=True, show_progress_bar=False)
+        except RuntimeError as e:
+            if "CUDA" in str(e):
+                print("⚠️ GPU OOM, fallback sang CPU.")
+                return self.embedder.encode(
+                    sentences, convert_to_numpy=True, show_progress_bar=False, device="cpu"
+                )
+            raise e
+    # ============================================================
+    # 3️⃣ Lọc ý chính trước (EXTRACTIVE)
+    # ============================================================
+    def _extractive_filter(self, sentences):
+        """Chọn ra top-k câu đại diện nội dung nhất."""
+        if len(sentences) <= 3:
+            return sentences
+        embeddings = self._encode(sentences)
+        mean_vec = np.mean(embeddings, axis=0)
+        sims = np.dot(embeddings, mean_vec) / (
+            np.linalg.norm(embeddings, axis=1) * np.linalg.norm(mean_vec)
+        )
+        # Chọn top-k câu có similarity cao nhất
+        k = max(1, int(len(sentences) * self.key_sent_ratio))
+        idx = np.argsort(-sims)[:k]
+        idx.sort()  # giữ thứ tự gốc
+        selected = [sentences[i] for i in idx]
+        return selected
+    # ============================================================
+    # 4️⃣ Gộp các câu trọng tâm theo ngữ nghĩa
+    # ============================================================
+    def _semantic_group(self, sentences):
+        """Gộp các câu đã lọc theo mức tương đồng ngữ nghĩa."""
+        if not sentences:
+            return []
+        embeddings = self._encode(sentences)
+        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+        chunks, cur_chunk, cur_len = [], [], 0
+        for i, sent in enumerate(sentences):
+            wc = len(sent.split())
+            if not cur_chunk:
+                cur_chunk.append(sent)
+                cur_len = wc
+                continue
+            sim = np.dot(embeddings[i - 1], embeddings[i])
+            too_long = cur_len + wc > self.max_words
+            too_short = cur_len < self.min_words
+            topic_changed = sim < self.sim_threshold
+            if too_long or (not too_short and topic_changed):
+                chunks.append(" ".join(cur_chunk))
+                cur_chunk = [sent]
+                cur_len = wc
+            else:
+                cur_chunk.append(sent)
+                cur_len += wc
+        if cur_chunk:
+            chunks.append(" ".join(cur_chunk))
+        return chunks
+    # ============================================================
+    # 5️⃣ Hàm chính build()
+    # ============================================================
+    def build(self, full_text: str):
+        """
+        Trả về list chứa {Index, Content} cho từng chunk.
+        Quy trình:
+            - Lọc câu trọng tâm trước
+            - Gộp các câu đã lọc theo ngữ nghĩa
+        """
+        all_sentences = self._split_sentences(full_text)
+        print(f"📄 Tổng số câu: {len(all_sentences)}")
+        # --- Bước 1: lọc ý chính ---
+        filtered = self._extractive_filter(all_sentences)
+        print(f"✨ Giữ lại {len(filtered)} câu (~{len(filtered)/len(all_sentences):.0%}) sau extractive filter")
+        # --- Bước 2: gộp thành các đoạn ngữ nghĩa ---
+        chunks = self._semantic_group(filtered)
+        results = [{"Index": i, "Content": chunk} for i, chunk in enumerate(chunks, start=1)]
+        print(f"🔹 Tạo {len(results)} chunk ngữ nghĩa từ {len(filtered)} câu trọng tâm.")
+        return results

Libraries/Json_GetStructures.py ADDED Viewed

	@@ -0,0 +1,223 @@

+import re
+from typing import Dict, List, Any
+from collections import Counter, defaultdict
+class StructureAnalyzer:
+    def __init__(self, verbose: bool = False):
+        self.verbose = verbose
+    # ---------------- B1 ---------------- #
+    def extract_markers(self, RawDataDict) -> List[str]:
+        bullet_pattern = re.compile(r"^\s*[-•●♦▪‣–—]+\s*$")
+        paragraphs = RawDataDict.get("paragraphs", [])
+        common_markers = set(RawDataDict.get("general", {}).get("commonMarkers", []))
+        raw_markers: List[Any] = []
+        for p in paragraphs:
+            mt = p.get("MarkerText")
+            mtype = p.get("MarkerType")
+            # Bỏ bullet
+            if bullet_pattern.match(mt or "") or bullet_pattern.match(mtype or ""):
+                continue
+            # Giữ nếu thuộc common hoặc là None
+            if mtype in common_markers or mtype is None:
+                raw_markers.append(mtype)
+        # Loại bỏ trùng kề nhau và chuẩn hóa None -> "none"
+        cleaned: List[str] = []
+        prev = object()
+        for m in raw_markers:
+            val = str(m) if m is not None else "none"
+            if val != prev:
+                cleaned.append(val)
+                prev = val
+        return cleaned
+    # ---------------- B2 ---------------- #
+    def build_structures(self, markers: List[str]) -> List[Dict[str, Any]]:
+        unique_markers = list(dict.fromkeys(markers))
+        counter1 = Counter(markers)
+        results = [{"Depth": 1, "Structure": [m], "Count": counter1[m]} for m in unique_markers]
+        max_depth = len(unique_markers)
+        prev_structures = set((m,) for m in unique_markers)
+        for i in range(2, max_depth + 1):
+            counter = Counter()
+            for j in range(len(markers) - i + 1):
+                seq_raw = tuple(markers[j:j+i])
+                prefix = seq_raw[:-1]
+                # Điều kiện 1: phải có cha
+                if prefix not in prev_structures:
+                    continue
+                # Điều kiện 2: không trùng MarkerType trong cùng cấu trúc
+                if len(seq_raw) != len(set(seq_raw)):
+                    continue
+                # Điều kiện 3: chỉ chấp nhận nếu "none" không có, hoặc nằm ở cuối
+                if "none" in seq_raw and seq_raw[-1] != "none":
+                    continue
+                counter[seq_raw] += 1
+            if not counter:
+                break
+            min_count = min(counter.values())
+            max_count = max(counter.values())
+            filtered = {s: f for s, f in counter.items() if not (f == min_count and f != max_count)}
+            sorted_structs = sorted(filtered.items(), key=lambda x: x[1], reverse=True)
+            depth_lines = [{"Depth": i, "Structure": list(s), "Count": f} for s, f in sorted_structs]
+            results.extend(depth_lines)
+            prev_structures = set(s for s, _ in sorted_structs)
+        return results
+    # ---------------- B3 ---------------- #
+    def deduplicate(self, structures: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        grouped = defaultdict(list)
+        for item in structures:
+            depth = item["Depth"]
+            key = (depth, tuple(sorted(item["Structure"])))
+            grouped[key].append(item)
+        filtered = []
+        for _, group in grouped.items():
+            best = max(group, key=lambda x: x["Count"])
+            filtered.append(best)
+        filtered.sort(key=lambda x: (x["Depth"], -x["Count"], x["Structure"]))
+        return filtered
+    # ---------------- B4 ---------------- #
+    def select_top(self, dedup: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        if not dedup:
+            return []
+        max_depth = max(item["Depth"] for item in dedup)
+        at_max = [x for x in dedup if x["Depth"] == max_depth]
+        max_count = max(x["Count"] for x in at_max)
+        top = [x for x in at_max if x["Count"] == max_count]
+        result = []
+        for t in top:
+            level_dict = {}
+            for i, marker in enumerate(t["Structure"]):
+                if i == len(t["Structure"]) - 1:
+                    # phần tử cuối cùng
+                    level_dict["Contents"] = marker
+                else:
+                    level_dict[f"Level {i+1}"] = marker
+            result.append(level_dict)
+        return result
+    def level_rank(level: str) -> int:
+        """Quy đổi level thành số để so sánh"""
+        if level == "Contents":
+            return 9999  # Contents coi như cao nhất
+        if level.startswith("Level "):
+            try:
+                return int(level.split()[1])
+            except Exception:
+                return 0
+        return 0
+    def extend_top(self, top: List[Dict[str, Any]], dedup: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """
+        Mở rộng top bằng cách thêm tail từ dedup:
+        - Nếu Contents: chỉ giữ tail == ['none']
+        - Các level khác: thêm tail vào các level tiếp theo
+        - Nếu level đã có -> gộp vào list
+        - Luôn chuẩn hóa: mọi giá trị là list
+        """
+        if not top:
+            return []
+        RawLvlsDict = dict(top[0])  # copy để tránh sửa trực tiếp
+        all_markers = set(v for val in RawLvlsDict.values() for v in (val if isinstance(val, list) else [val]))
+        seen_tails = set()
+        # snapshot tránh lỗi "dict changed size"
+        snapshot_items = list(RawLvlsDict.items())
+        for level, marker_values in reversed(snapshot_items):
+            if level == "Level 1":
+                continue
+            # chuẩn hóa về list để dễ xử lý
+            markers = marker_values if isinstance(marker_values, list) else [marker_values]
+            for marker in markers:
+                for d in dedup:
+                    struct = d["Structure"]
+                    if d["Depth"] < 2:
+                        continue
+                    if struct and struct[0] == marker:
+                        if not (set(struct) & (all_markers - {marker})):
+                            tail = tuple(struct[1:])
+                            # xử lý riêng cho Contents
+                            if level == "Contents" and tail != ("none",):
+                                continue
+                            if tail in seen_tails:
+                                continue
+                            seen_tails.add(tail)
+                            # xác định base level
+                            if level.startswith("Level "):
+                                base_level_num = int(level.split()[1])
+                            elif level == "Contents":
+                                base_level_num = max(
+                                    int(l.split()[1]) for l in RawLvlsDict if l.startswith("Level ")
+                                )
+                            else:
+                                base_level_num = 0
+                            # thêm từng phần tử tail vào level tiếp theo
+                            for i, t in enumerate(tail, start=1):
+                                next_level = f"Level {base_level_num+i}"
+                                if next_level not in RawLvlsDict:
+                                    RawLvlsDict[next_level] = []
+                                if not isinstance(RawLvlsDict[next_level], list):
+                                    RawLvlsDict[next_level] = [RawLvlsDict[next_level]]
+                                if t not in RawLvlsDict[next_level]:
+                                    RawLvlsDict[next_level].append(t)
+        # đổi level cao nhất thành Contents (và gộp nếu đã có)
+        level_nums = [int(l.split()[1]) for l in RawLvlsDict if l.startswith("Level ")]
+        if level_nums:
+            max_level = f"Level {max(level_nums)}"
+            new_contents = RawLvlsDict.pop(max_level)
+            if "Contents" not in RawLvlsDict:
+                RawLvlsDict["Contents"] = []
+            if not isinstance(RawLvlsDict["Contents"], list):
+                RawLvlsDict["Contents"] = [RawLvlsDict["Contents"]]
+            for v in (new_contents if isinstance(new_contents, list) else [new_contents]):
+                if v not in RawLvlsDict["Contents"]:
+                    RawLvlsDict["Contents"].append(v)
+        # --- 🔹 Đổi nhãn ngay trước khi trả kết quả --- #
+        keys = list(RawLvlsDict.keys())
+        if len(keys) > 1 and keys[-2].startswith("Level "):
+            RawLvlsDict["Article"] = RawLvlsDict.pop(keys[-2])
+        if "Contents" in RawLvlsDict:
+            RawLvlsDict["Content"] = RawLvlsDict.pop("Contents")
+        # chuẩn hóa tất cả value thành list
+        for k, v in RawLvlsDict.items():
+            if not isinstance(v, list):
+                RawLvlsDict[k] = [v]
+        return [RawLvlsDict]

Libraries/Json_SchemaExt.py ADDED Viewed

	@@ -0,0 +1,155 @@

+from typing import Dict, List, Any
+class JSONSchemaExtractor:
+    def __init__(self, list_policy: str = "first", verbose: bool = True) -> None:
+        """
+        :param list_policy: "first" | "union"
+            - "first": nếu gặp list các object, lấy schema theo PHẦN TỬ ĐẦU (như bản gốc).
+            - "union": duyệt mọi phần tử, hợp nhất các field/type.
+        """
+        assert list_policy in ("first", "union"), "list_policy must be 'first' or 'union'"
+        self.list_policy = list_policy
+        self.verbose = verbose
+        self._processed_fields: set[str] = set()
+        self._full_schema: Dict[str, str] = {}
+    # =====================================
+    # 1) Chuẩn hóa kiểu dữ liệu
+    # =====================================
+    @staticmethod
+    def get_standard_type(value: Any) -> str:
+        if isinstance(value, bool):
+            return "boolean"
+        elif isinstance(value, int):
+            return "number"
+        elif isinstance(value, float):
+            return "number"
+        elif isinstance(value, str):
+            return "string"
+        elif isinstance(value, list):
+            return "array"
+        elif isinstance(value, dict):
+            return "object"
+        elif value is None:
+            return "null"
+        return "unknown"
+    # =====================================
+    # 2) Hợp nhất kiểu (null / mixed)
+    # =====================================
+    def _merge_type(self, key: str, new_type: str, item_index: int) -> None:
+        """
+        Cập nhật self._full_schema[key] theo quy tắc:
+         - Nếu chưa có: đặt = new_type và log "New: ..."
+         - Nếu khác:
+             + Nếu new_type == "null": giữ kiểu cũ.
+             + Nếu kiểu cũ == "null": cập nhật = new_type.
+             + Ngược lại: nếu khác nhau và chưa "mixed" => set "mixed" và cảnh báo.
+        """
+        if key not in self._full_schema:
+            self._full_schema[key] = new_type
+            self._processed_fields.add(key)
+            return
+        old_type = self._full_schema[key]
+        if old_type == new_type:
+            return
+        if new_type == "null":
+            return
+        if old_type == "null":
+            self._full_schema[key] = new_type
+            return
+        if old_type != "mixed":
+            self._full_schema[key] = "mixed"
+    # =====================================
+    # 3) Đệ quy trích xuất schema
+    # =====================================
+    def _extract_schema_from_obj(self, data: Dict[str, Any], prefix: str, item_index: int) -> None:
+        """
+        Duyệt dict hiện tại, cập nhật _full_schema với kiểu tại key (phẳng),
+        và nếu là object/array lồng thì đệ quy theo quy tắc gốc.
+        """
+        for key, value in data.items():
+            new_prefix = f"{prefix}{key}" if prefix else key
+            vtype = self.get_standard_type(value)
+            self._merge_type(new_prefix, vtype, item_index)
+            if isinstance(value, dict):
+                self._extract_schema_from_obj(value, f"{new_prefix}.", item_index)
+            elif isinstance(value, list) and value:
+                first = value[0]
+                if isinstance(first, dict):
+                    if self.list_policy == "first":
+                        self._extract_schema_from_obj(first, f"{new_prefix}.", item_index)
+                    else:  # union
+                        for elem in value:
+                            if isinstance(elem, dict):
+                                self._extract_schema_from_obj(elem, f"{new_prefix}.", item_index)
+                elif isinstance(first, list):
+                    if self.list_policy == "first":
+                        self._extract_schema_from_list(first, f"{new_prefix}.", item_index)
+                    else:
+                        for elem in value:
+                            if isinstance(elem, list):
+                                self._extract_schema_from_list(elem, f"{new_prefix}.", item_index)
+    def _extract_schema_from_list(self, data_list: List[Any], prefix: str, item_index: int) -> None:
+        """
+        Hỗ trợ cho trường hợp list lồng list (ít gặp). Duyệt tương tự _extract_schema_from_obj.
+        """
+        if not data_list:
+            return
+        first = data_list[0]
+        if isinstance(first, dict):
+            if self.list_policy == "first":
+                self._extract_schema_from_obj(first, prefix, item_index)
+            else:
+                for elem in data_list:
+                    if isinstance(elem, dict):
+                        self._extract_schema_from_obj(elem, prefix, item_index)
+        elif isinstance(first, list):
+            if self.list_policy == "first":
+                self._extract_schema_from_list(first, prefix, item_index)
+            else:
+                for elem in data_list:
+                    if isinstance(elem, list):
+                        self._extract_schema_from_list(elem, prefix, item_index)
+    # =====================================
+    # 4) API chính (data/file)
+    # =====================================
+    def create_schema_from_data(self, data: Any) -> Dict[str, str]:
+        """
+        Tạo schema từ biến Python (list | dict).
+        Giữ log giống bản gốc.
+        """
+        self._processed_fields.clear()
+        self._full_schema.clear()
+        data_list = data if isinstance(data, list) else [data]
+        if not data_list:
+            raise ValueError("JSON data is empty")
+        for i, item in enumerate(data_list, 1):
+            if not isinstance(item, dict):
+                continue
+            self._extract_schema_from_obj(item, prefix="", item_index=i)
+        return dict(self._full_schema)
+    def schemaRun(self, SegmentDict: str) -> Dict[str, str]:
+        SchemaDict = self.create_schema_from_data(SegmentDict)
+        return SchemaDict

Libraries/PDF_ExtractData.py ADDED Viewed

	@@ -0,0 +1,605 @@

+import re
+from typing import Dict, Any
+from collections import Counter, defaultdict
+from . import Common_TextProcess as TextProcess
+from . import Common_PdfProcess as PdfProcess
+# ===============================
+# 1. Utils  -> class U1_Utils
+# ===============================
+class U1_Utils:
+    # ===== Hàm tự động thu thập tên riêng =====
+    @staticmethod
+    def collect_proper_names(lines, min_count=10):
+        title_words = []
+        for line in lines:
+            text = line.get("Text", "")
+            words = re.findall(r"[A-Za-zÀ-ỹĐđ0-9]+", text)
+            if not words:
+                continue
+            # Bỏ qua từ đầu tiên
+            for w in words[1:]:
+                if w.istitle():
+                    clean_w = TextProcess.normalize_word(w)
+                    if clean_w:
+                        title_words.append(clean_w)
+        counter = Counter(title_words)
+        proper_names = {TextProcess.normalize_word(w) for w, cnt in counter.items() if cnt >= min_count}
+        return proper_names
+    @staticmethod
+    def extract_marker(text, patterns):
+        for pattern_info in patterns["markers"]:
+            match = pattern_info["pattern"].match(text)
+            if match:
+                marker_text = re.sub(r'^\s+', '', match.group(0))
+                marker_text = re.sub(r'\s+$', ' ', marker_text)
+                return {"marker_text": marker_text}
+        return {"marker_text": None}
+    @staticmethod
+    def format_marker(marker_text, patterns):
+        """
+        Chuẩn hoá MarkerText
+        """
+        if not marker_text:
+            return None
+        formatted = marker_text
+        formatted = re.sub(r'\b[0-9]+\b', '123', formatted)
+        formatted = re.sub(r'\b[IVXLC]+\b', 'XVI', formatted)
+        parts = re.split(r'(\W+)', formatted)
+        formatted_parts = []
+        for part in parts:
+            if re.match(r'(\W+)', part):
+                formatted_parts.append(part)
+                continue
+            if part.lower() in patterns["keywords_set"]:
+                formatted_parts.append(part)
+            elif re.match(r'^[a-z]$', part) or re.match(r'^[a-zđêôơư]$', part):
+                formatted_parts.append('abc')
+            elif re.match(r'^[A-Z]$', part) or re.match(r'^[A-ZĐÊÔƠƯ]$', part):
+                formatted_parts.append('ABC')
+            else:
+                formatted_parts.append(part)
+        return ''.join(formatted_parts)
+    # ===== Hàm chuẩn hoá số La Mã =====
+    @staticmethod
+    def normalizeRomans(lines, mode="marker", replace_with="ABC"):
+        format_groups = defaultdict(list)
+        for idx, line in enumerate(lines):
+            fmt = line.get("MarkerType")
+            marker = line.get("MarkerText")
+            if fmt and marker:
+                format_groups[fmt].append((idx, marker))
+        # --- kiểm tra MarkerType ---
+        if mode == "marker":
+            for fmt, group in format_groups.items():
+                roman_markers = []
+                for idx, marker in group:
+                    m = re.search(r'\b([IVXLC]+)\b', marker)
+                    if m and TextProcess.is_roman(m.group(1)):
+                        roman_markers.append((idx, m.group(1)))
+                    else:
+                        break
+                if roman_markers:
+                    roman_numbers = [TextProcess.roman_to_int(rm[1]) for rm in roman_markers]
+                    expected = list(range(min(roman_numbers), max(roman_numbers) + 1))
+                    if sorted(roman_numbers) != expected:
+                        for idx, _ in roman_markers:
+                            lines[idx]["MarkerType"] = re.sub(r'\b[IVXLC]+\b', replace_with, lines[idx]["MarkerType"])
+        # --- Chuẩn hoá toàn bộ Text/MarkerText ---
+        elif mode == "text":
+            for line in lines:
+                for key in ["Text", "MarkerText", "MarkerType"]:
+                    if line.get(key):
+                        line[key] = re.sub(r'\b[IVXLC]+\b', replace_with, line[key])
+        return lines
+# ===============================
+# 2. Word-level functions (mới) -> class U2_Word
+# ===============================
+class U2_Word:
+    @staticmethod
+    def caseStyle(word_text: str) -> int:
+        """CaseStyle cho từ: 3000 (UPPER), 2000 (Title), 1000 (khác)"""
+        clean = re.sub(r'[^A-Za-zÀ-ỹà-ỹ0-9]', '', word_text)
+        if clean and clean.isupper():
+            return 3000
+        if clean and clean.istitle():
+            return 2000
+        return 1000
+    @staticmethod
+    def buildStyle(word_text, span):
+        """Style gộp = CaseStyle + FontStyle (100,10,1)"""
+        cs = U2_Word.caseStyle(word_text)
+        b, i, u = PdfProcess.fontFlags(span)
+        fs = (100 if b else 0) + (10 if i else 0) + (1 if u else 0)
+        return cs + fs
+    @staticmethod
+    def getWordStyle(line, index: int):
+        """Lấy Style của từ tại vị trí index."""
+        words = PdfProcess.extractWords(line)
+        if -len(words) <= index < len(words):
+            word, span = words[index]
+            return U2_Word.buildStyle(word, span)
+        return 0
+# ===============================
+# 3. Line-level functions (mới) -> class U3_Line
+# ===============================
+class U3_Line:
+    @staticmethod
+    def getPageGeneralSize(page):
+        """[height, width] của trang"""
+        return [round(page.rect.height, 1), round(page.rect.width, 1)]
+    @staticmethod
+    def getLineText(line):
+        """Text đầy đủ của line"""
+        return line.get("text", "")
+    @staticmethod
+    def getLineStyle(line, exceptions=None):
+        """
+        Style của line = CaseStyle (min trên từ hợp lệ) + FontStyle (AND spans).
+        """
+        words = line.get("words", [])
+        spans = line.get("spans", [])
+        # Gom exceptions
+        exception_texts = set()
+        if exceptions:
+            exception_texts = (
+                set(exceptions.get("common_words", [])) |
+                set(exceptions.get("proper_names", [])) |
+                set(exceptions.get("abbreviations", []))
+            )
+        # ===== CaseStyle =====
+        cs_values = []
+        for w, _ in words:
+            clean_w = TextProcess.normalize_word(w)
+            if not clean_w:
+                continue
+            if clean_w in exception_texts or TextProcess.is_abbreviation(clean_w):
+                continue
+            cs_values.append(U2_Word.caseStyle(clean_w))
+        cs_line = min(cs_values) if cs_values else 1000
+        # ===== FontStyle =====
+        if spans:
+            bold_all = italic_all = underline_all = True
+            for s in spans:
+                b, i, u = PdfProcess.fontFlags(s)
+                bold_all &= b
+                italic_all &= i
+                underline_all &= u
+            fs_line = (100 if bold_all else 0) + (10 if italic_all else 0) + (1 if underline_all else 0)
+        else:
+            fs_line = 0
+        return cs_line + fs_line
+# ===============================
+# 4. Compatibility wrappers -> class U4_Compat
+# ===============================
+class U4_Compat:
+    @staticmethod
+    def getText(line):
+        """Alias cũ: Text của line"""
+        return U3_Line.getLineText(line)
+    @staticmethod
+    def getCoords(line):
+        """Alias cũ: Coord của line, giữ tuple (x0, x1, xm, y0, y1)"""
+        return PdfProcess.getLineCoord(line)
+    @staticmethod
+    def getFirstWord(line):
+        """Giữ API cũ: trả {Text, Style, FontSize} của từ đầu"""
+        return {
+            "Text": PdfProcess.getWordText(line, 0),
+            "Style": U2_Word.getWordStyle(line, 0),
+            "FontSize": PdfProcess.getWordFontSize(line, 0),
+        }
+    @staticmethod
+    def getLastWord(line):
+        """Giữ API cũ: trả {Text, Style, FontSize} của từ cuối"""
+        return {
+            "Text": PdfProcess.getWordText(line, -1),
+            "Style": U2_Word.getWordStyle(line, -1),
+            "FontSize": PdfProcess.getWordFontSize(line, -1),
+        }
+# ===============================
+# 5. Marker / Style (line-level) -> class U5_MarkerStyle
+# ===============================
+class U5_MarkerStyle:
+    @staticmethod
+    def getMarker(text, patterns):
+        info = U1_Utils.extract_marker(text, patterns)
+        marker_text = info.get("marker_text")
+        marker_type = None
+        if marker_text:
+            # Giữ sửa lỗi xử lý dấu '+'
+            marker_text_cleaned = re.sub(r'([A-Za-z0-9ĐÊÔƠƯđêôơư])\+(?=\W|$)', r'\1', marker_text)
+            marker_type = U1_Utils.format_marker(marker_text_cleaned, patterns)
+        return marker_text, marker_type
+    @staticmethod
+    def getFontSize(line):
+        """
+        Mean FontSize trên spans (logic cũ) — vẫn giữ cho compatibility nếu còn chỗ gọi.
+        """
+        spans = line.get("spans", [])
+        if spans:
+            valid_spans = [s for s in spans if s.get("text", "").strip()]
+            if valid_spans:
+                sizes = [s.get("size", 12.0) for s in valid_spans]
+            else:
+                sizes = [s.get("size", 12.0) for s in spans]
+            avg = sum(sizes) / len(sizes)
+            return round(avg * 2) / 2
+        return 12.0
+# ===============================
+# 6. Tổng hợp toàn văn bản -> class U6_Document
+# ===============================
+class U6_Document:
+    @staticmethod
+    def getTextStatus(pdf_doc, exceptions, patterns):
+        doc = pdf_doc
+        general = {"pageGeneralSize": U3_Line.getPageGeneralSize(doc[0])}
+        lines = []
+        for i, page in enumerate(doc):
+            text_dict = page.get_text("dict")
+            for block in text_dict["blocks"]:
+                if "lines" in block:
+                    for l in block["lines"]:
+                        text = "".join(span["text"] for span in l["spans"]).strip()
+                        if not text:
+                            continue
+                        # Marker
+                        marker_text, marker_type = U5_MarkerStyle.getMarker(text, patterns)
+                        # Style/FontSize/Coord
+                        line_obj = {"text": text, "spans": l["spans"]}
+                        style = U3_Line.getLineStyle(line_obj)
+                        fontsize = PdfProcess.getLineFontSize(line_obj)
+                        x0, x1, xm, y0, y1 = PdfProcess.getLineCoord(line_obj)
+                        # Words
+                        words_obj = {
+                            "First": U4_Compat.getFirstWord(line_obj),
+                            "Last":  U4_Compat.getLastWord(line_obj)
+                        }
+                        line_dict = {
+                            "Line": len(lines) + 1,
+                            "Text": text,
+                            "MarkerText": marker_text,
+                            "MarkerType": marker_type,
+                            "Style": style,
+                            "FontSize": fontsize,
+                            "Words": words_obj,
+                            "Coords": {"X0": x0, "X1": x1, "XM": xm, "Y0": y0, "Y1": y1}
+                        }
+                        lines.append(line_dict)
+        return {"general": general, "lines": lines}
+# ===============================
+# 7. Các hàm set* -> class U7_Setters
+# ===============================
+class U7_Setters:
+    @staticmethod
+    def setCommonStatus(lines, attr, rank=1):
+        values = [l[attr] for l in lines if l.get(attr) is not None]
+        counter = Counter(values)
+        return counter.most_common(rank)
+    @staticmethod
+    def setCommonFontSize(lines):
+        fs, _ = U7_Setters.setCommonStatus(lines, "FontSize", 1)[0]
+        return round(fs, 1)
+    @staticmethod
+    def setCommonFontSizes(lines):
+        """
+        Trả về tất cả FontSize và số lượng của chúng, sắp xếp theo tần suất giảm dần.
+        """
+        values = [l["FontSize"] for l in lines if l.get("FontSize") is not None]
+        counter = Counter(values)
+        results = []
+        for fs, count in counter.most_common():  # trả về tất cả
+            results.append({"FontSize": round(fs, 1), "Count": count})
+        return results
+    @staticmethod
+    def setCommonMarkers(lines):
+        total = len(lines)
+        counter = Counter([l["MarkerType"] for l in lines if l["MarkerType"]])
+        results = []
+        for marker, count in counter.most_common(10):
+            if count >= total * 0.005:
+                results.append(marker)
+            else:
+                break
+        return results
+    @staticmethod
+    def setTextStatus(baseJson):
+        lines = baseJson["lines"]
+        pageGeneralSize = baseJson["general"]["pageGeneralSize"]
+        xStart, yStart, xEnd, yEnd, xMid, yMid = PdfProcess.setPageCoords(lines, pageGeneralSize)
+        regionWidth, regionHeight = PdfProcess.setPageRegionSize(xStart, yStart, xEnd, yEnd)
+        commonFontSizes = U7_Setters.setCommonFontSizes(lines)
+        commonFontSize = U7_Setters.setCommonFontSize(lines)
+        commonMarkers = U7_Setters.setCommonMarkers(lines)
+        new_general = {
+            "pageGeneralSize": baseJson["general"]["pageGeneralSize"],
+            "pageCoords": {"xStart": xStart, "yStart": yStart, "xEnd": xEnd, "yEnd": yEnd, "xMid": xMid, "yMid": yMid},
+            "pageRegionWidth": regionWidth,
+            "pageRegionHeight": regionHeight,
+            "commonFontSize": commonFontSize,
+            "commonFontSizes": commonFontSizes,
+            "commonMarkers": commonMarkers
+        }
+        new_lines = []
+        for i, line in enumerate(lines):
+            lineWidth, lineHeight = PdfProcess.setLineSize(line)
+            pos = PdfProcess.setPosition(line, lines[i - 1] if i > 0 else None,
+                              lines[i + 1] if i < len(lines) - 1 else None,
+                              xStart, xEnd, xMid)
+            pos_dict = {"Left": pos[0], "Right": pos[1], "Mid": pos[2], "Top": pos[3], "Bot": pos[4]}
+            line_dict = {
+                **line,
+                "LineWidth": lineWidth,
+                "LineHeight": lineHeight,
+                "Position": pos_dict,
+                "Align": PdfProcess.setAlign(pos_dict, regionWidth)
+            }
+            new_lines.append(line_dict)
+        return {"general": new_general, "lines": new_lines}
+# ===============================
+# 8. Các hàm del/reset -> class U8_Cleanup
+# ===============================
+class U8_Cleanup:
+    @staticmethod
+    def delStatus(jsonDict, deleteList):
+        for line in jsonDict["lines"]:
+            for attr in deleteList:
+                if attr in line:
+                    del line[attr]
+        return jsonDict
+    @staticmethod
+    def resetPosition(jsonDict):
+        lines = jsonDict.get("lines", [])
+        for i, line in enumerate(lines):
+            pos = line.get("Position", {})
+            if "Top" in pos and pos["Top"] < 0:
+                top_candidates = []
+                if i > 0:
+                    prev_top = lines[i - 1].get("Position", {}).get("Top")
+                    if prev_top is not None:
+                        top_candidates.append(prev_top)
+                if i < len(lines) - 1:
+                    next_top = lines[i + 1].get("Position", {}).get("Top")
+                    if next_top is not None:
+                        top_candidates.append(next_top)
+                if top_candidates:
+                    pos["Top"] = min(top_candidates)
+            if "Bot" in pos and pos["Bot"] < 0:
+                bot_candidates = []
+                if i > 0:
+                    prev_bot = lines[i - 1].get("Position", {}).get("Bot")
+                    if prev_bot is not None:
+                        bot_candidates.append(prev_bot)
+                if i < len(lines) - 1:
+                    next_bot = lines[i + 1].get("Position", {}).get("Bot")
+                    if next_bot is not None:
+                        bot_candidates.append(next_bot)
+                if bot_candidates:
+                    pos["Bot"] = min(bot_candidates)
+            line["Position"] = pos
+        return jsonDict
+    @staticmethod
+    def normalizeFinal(jsonDict):
+        for line in jsonDict.get("lines", []):
+            # xử lý Text và MarkerText
+            if "Text" in line:
+                line["Text"] = TextProcess.strip_extra_spaces(line["Text"])
+            if "MarkerText" in line and line["MarkerText"]:
+                line["MarkerText"] = TextProcess.strip_extra_spaces(line["MarkerText"])
+            # xử lý word-level
+            words = line.get("Words", {})
+            for key in ["First", "Last"]:
+                if key in words and "Text" in words[key]:
+                    words[key]["Text"] = TextProcess.strip_extra_spaces(words[key]["Text"])
+        return jsonDict
+# ===============================
+# 9. Hàm chính extractData (giữ API cũ)
+# ===============================
+def extractData(pdf_doc, exceptData, markerData, statusData):
+    # ===== 1. Load JSON theo format đồng bộ =====
+    exceptions = dict(exceptData)
+    markers = dict(markerData)
+    status = dict(statusData)
+    # ===== 2. Biên dịch markers =====
+    keywords = markers.get("keywords", [])
+    title_keywords = '|'.join(re.escape(k[0].upper() + k[1:].lower()) for k in keywords)
+    upper_keywords = '|'.join(re.escape(k.upper()) for k in keywords)
+    all_keywords = f"{title_keywords}|{upper_keywords}"
+    compiled_markers = []
+    for item in markers.get("markers", []):
+        pattern_str = item["pattern"].replace("{keywords}", all_keywords)
+        try:
+            compiled_pattern = re.compile(pattern_str)
+        except re.error:
+            continue
+        compiled_markers.append({
+            "pattern": compiled_pattern,
+            "description": item.get("description", ""),
+            "type": item.get("type", "")
+        })
+    patterns = {
+        "markers": compiled_markers,
+        "keywords_set": set(k.lower() for k in keywords)
+    }
+    # ===== 3. Xử lý PDF =====
+    baseJson = U6_Document.getTextStatus(pdf_doc, exceptions, patterns)
+    baseJson["lines"] = U1_Utils.normalizeRomans(baseJson["lines"])
+    modifiedJson = U7_Setters.setTextStatus(baseJson)
+    cleanJson = U8_Cleanup.resetPosition(modifiedJson)
+    extractedData = U8_Cleanup.delStatus(cleanJson, ["Coords"])
+    extractedData = U8_Cleanup.normalizeFinal(extractedData)
+    # ===== 4. Bổ sung tên riêng động =====
+    proper_names_auto = U1_Utils.collect_proper_names(extractedData["lines"], min_count=10)
+    proper_names_existing = [p["text"] if isinstance(p, dict) else str(p)
+                                for p in exceptions.get("proper_names", [])]
+    exceptions["proper_names"] = list(set(proper_names_existing) | proper_names_auto)
+    return extractedData
+class B1Extractor:
+    """
+    Orchestrator theo instance:
+    - Giữ nguyên quy tắc/thuật toán của extractData cũ.
+    - exceptions/markers/status và regex markers được nạp/biên dịch 1 lần.
+    """
+    def __init__(
+        self,
+        exceptData: Any,
+        markerData: Any,
+        statusData: Any,
+        proper_name_min_count: int = 10,
+    ) -> None:
+        """
+        exceptData / markerData / statusData:
+          - str: đường dẫn tới JSON theo format đồng bộ (U1_Utils.loadHardcodes)
+          - dict: dữ liệu đã load sẵn (bỏ qua loadHardcodes)
+        proper_name_min_count:
+          - Ngưỡng đếm tên riêng động.
+        """
+        # ---- 1) Nạp exceptions/markers/status (không đổi format) ----
+        def _ensure_dict(src, wanted=None):
+            if isinstance(src, dict):
+                return dict(src)
+            raise ValueError("Vui lòng truyền dict đã load sẵn thay vì đường dẫn file.")
+        self.exceptions: Dict[str, Any] = _ensure_dict(
+            exceptData, wanted=["common_words", "proper_names", "abbreviations"]
+        )
+        self.markers: Dict[str, Any] = _ensure_dict(
+            markerData, wanted=["keywords", "markers"]
+        )
+        self.status: Dict[str, Any] = _ensure_dict(statusData)
+        self.proper_name_min_count = proper_name_min_count
+        # ---- 2) Biên dịch markers (y như logic cũ) ----
+        keywords = self.markers.get("keywords", [])
+        title_keywords = "|".join(re.escape(k[0].upper() + k[1:].lower()) for k in keywords)
+        upper_keywords = "|".join(re.escape(k.upper()) for k in keywords)
+        all_keywords = f"{title_keywords}|{upper_keywords}" if keywords else ""
+        compiled_markers = []
+        for item in self.markers.get("markers", []):
+            pattern_str = item.get("pattern", "")
+            if all_keywords:
+                pattern_str = pattern_str.replace("{keywords}", all_keywords)
+            try:
+                compiled = re.compile(pattern_str)
+            except re.error:
+                continue
+            compiled_markers.append(
+                {
+                    "pattern": compiled,
+                    "description": item.get("description", ""),
+                    "type": item.get("type", ""),
+                }
+            )
+        self.patterns = {
+            "markers": compiled_markers,
+            "keywords_set": set(k.lower() for k in keywords),
+        }
+    # ---------- Public API ----------
+    def extract(self, pdf_doc) -> Dict[str, Any]:
+        """
+        Chạy pipeline extractData cũ cho 1 file PDF.
+        Trả về extractedData (như trước).
+        """
+        # ===== 3) Trích xuất text & thuộc tính dòng từ PDF =====
+        baseJson = U6_Document.getTextStatus(pdf_doc, self.exceptions, self.patterns)
+        # Chuẩn hoá số La Mã (giữ nguyên quy tắc)
+        baseJson["lines"] = U1_Utils.normalizeRomans(baseJson["lines"])
+        # ===== 4) Tính toán status/position/align (giữ nguyên) =====
+        modifiedJson = U7_Setters.setTextStatus(baseJson)
+        cleanJson = U8_Cleanup.resetPosition(modifiedJson)
+        extractedData = U8_Cleanup.delStatus(cleanJson, ["Coords"])
+        extractedData = U8_Cleanup.normalizeFinal(extractedData)
+        # ===== 5) Bổ sung proper_names động (giữ nguyên tinh thần) =====
+        proper_names_auto = U1_Utils.collect_proper_names(
+            extractedData["lines"], min_count=self.proper_name_min_count
+        )
+        proper_names_existing = [
+            p["text"] if isinstance(p, dict) else str(p)
+            for p in self.exceptions.get("proper_names", [])
+        ]
+        # Cập nhật vào trạng thái của instance (để chạy nhiều file liên tiếp vẫn tích lũy)
+        self.exceptions["proper_names"] = list(set(proper_names_existing) | proper_names_auto)
+        return extractedData

Libraries/PDF_MergeData.py ADDED Viewed

	@@ -0,0 +1,283 @@

+from collections import Counter
+from statistics import mean, multimode
+# ===============================
+# HÀM CHÍNH
+# ===============================
+def mergeLinesToParagraphs(baseJson):
+    """
+    Nhận vào JSON sau extractData (lines-level)
+    Trả về JSON mới (paragraph-level)
+    """
+    general = baseJson["general"]
+    lines = baseJson["lines"]
+    paragraphs = []
+    buffer = []
+    for i, curr in enumerate(lines):
+        if not buffer:
+            buffer.append(curr)
+            continue
+        prev = lines[i-1]
+        if canMerge(prev, curr, i-1, i):
+            buffer.append(curr)
+        else:
+            paragraphs.append(buildParagraph(buffer, len(paragraphs)+1, general))
+            buffer = [curr]
+    if buffer:
+        paragraphs.append(buildParagraph(buffer, len(paragraphs)+1, general))
+    merged = {"general": general, "paragraphs": paragraphs}
+    # >>> TÍNH LẠI 'common' TRONG GENERAL DỰA TRÊN PARAGRAPHS
+    merged = recomputeCommonsInGeneralAfterMerge(merged)
+    return {"general": general, "paragraphs": paragraphs}
+# ===============================
+# CÁC HÀM ĐIỀU KIỆN MERGE
+# ===============================
+def canMerge(prev, curr, idx_prev=None, idx_curr=None):
+    """
+    Kiểm tra line curr có thể merge vào prev không
+    Ghi log lý do True/False
+    """
+    pair = f"[{idx_prev+1}->{idx_curr+1}]" if idx_prev is not None else ""
+    if isNewPara(curr):
+        return False
+    if not isSameFontSize(prev, curr):
+        return False
+    if not isSameStyle(prev, curr):
+        return False
+    if not isNear(prev, curr):
+        return False
+    if isSameAlign(prev, curr):
+        return True
+    if isBadAlign(prev, curr):
+        return False
+    if canMergeWithAlign(prev) or canMergeWithLeft(prev, curr):
+        return True
+    print(f"{pair} Merge=False | Reason: Fallback")
+    return False
+# Check MarkerText
+def isNewPara(line):
+    return line.get("MarkerText") not in (None, "", " ")
+# Check FontSize
+def isSameFontSize(prev, curr):
+    return abs(prev["FontSize"] - curr["FontSize"]) <= 0.7
+# Check Style
+def isSameStyle(prev, curr):
+    return isSameLineStyle(prev, curr) or isSameFirstStyle(prev, curr) or isSameLastStyle(prev, curr) or isSameWordStyle(prev, curr)
+def isSameFStyle(prev, curr):
+    return isSameLineFStyle(prev, curr) or isSameFirstFStyle(prev, curr) or isSameLastFStyle(prev, curr) or isSameWordFStyle(prev, curr)
+def isSameCase(prev, curr):
+    return isSameLineCase(prev, curr) or isSameFirstCase(prev, curr) or isSameLastCase(prev, curr) or isSameWordCase(prev, curr)
+# Line - Line
+def isSameLineStyle(prev, curr):
+    return prev["Style"] == curr["Style"]
+def isSameLineFStyle(prev, curr):
+    return prev["Style"] %1000 == curr["Style"] %1000
+def isSameLineCase(prev, curr):
+    return prev["Style"] /1000 == curr["Style"] /1000
+# First - Line
+def isSameFirstStyle(prev, curr):
+    return prev["Style"] == curr["Words"]["First"]["Style"]
+def isSameFirstFStyle(prev, curr):
+    return prev["Style"] %1000 == curr["Words"]["First"]["Style"] %1000
+def isSameFirstCase(prev, curr):
+    return prev["Style"] /1000 == curr["Words"]["First"]["Style"] /1000
+# Last - Line
+def isSameLastStyle(prev, curr):
+    return prev["Words"]["Last"]["Style"] == curr["Style"]
+def isSameLastFStyle(prev, curr):
+    return prev["Words"]["Last"]["Style"] %1000 == curr["Style"] %1000
+def isSameLastCase(prev, curr):
+    return prev["Words"]["Last"]["Style"] /1000 == curr["Style"] /1000
+# Last - First
+def isSameWordStyle(prev, curr):
+    return prev["Words"]["Last"]["Style"] == curr["Words"]["First"]["Style"]
+def isSameWordFStyle(prev, curr):
+    return prev["Words"]["Last"]["Style"] %1000 == curr["Words"]["First"]["Style"] %1000
+def isSameWordCase(prev, curr):
+    return prev["Words"]["Last"]["Style"] /1000 == curr["Words"]["First"]["Style"] /1000
+# Linespace
+def isNear(prev, curr):
+    if "Position" not in prev or "Position" not in curr:
+        return False
+    if "LineHeight" not in curr:
+        return False
+    hig_curr = curr["LineHeight"]
+    top_prev = prev["Position"]["Top"]
+    top_curr = curr["Position"]["Top"]
+    bot_curr = curr["Position"]["Bot"]
+    return (top_curr < top_prev * 2) and ((top_curr < bot_curr * 2) or bot_curr <= 3.0) and (top_curr < hig_curr * 5)
+def isSameAlign(prev, curr):
+    return prev.get("Align") == curr.get("Align")
+def isBadAlign(prev, curr):
+    return (prev.get("Align") != "right" and curr.get("Align") == "right")
+def isNoSameAlign0(prev):
+    return prev.get("Align") == "Justify"
+def isNoSameAlignC(prev):
+    return prev.get("Align") == "Center"
+def isNoSameAlignR(prev):
+    return prev.get("Align") == "Right"
+def isNoSameAlignL(prev, curr):
+    return prev.get("Align") == "Left" and curr.get("Align") == "Justify"
+def canMergeWithAlign(prev):
+    return isNoSameAlign0(prev) or isNoSameAlignC(prev) or isNoSameAlignR(prev)
+def canMergeWithLeft(prev, curr):
+    return isNoSameAlignL(prev, curr)
+# ===============================
+# HÀM BUILD PARAGRAPH
+# ===============================
+def buildParagraph(lines, para_id, general=None):
+    """
+    Tạo dict Paragraph từ list lines đã merge
+    """
+    text = " ".join([ln["Text"] for ln in lines])
+    marker_text = lines[0]["MarkerText"]
+    marker_type = lines[0]["MarkerType"]
+    # Style: lấy min theo từng chữ số
+    style = mergeStyle([ln["Style"] for ln in lines])
+    # first_word = lines[0]["Words"]["First"]
+    # last_word = lines[-1]["Words"]["Last"]
+    fs_values = [ln["FontSize"] for ln in lines if ln.get("FontSize") is not None]
+    if fs_values:
+        modes = multimode(fs_values)  # trả về list tất cả các mode
+        if len(modes) == 1:
+            font_size = modes[0]
+        else:
+            # có nhiều mode → chọn gần với commonFontSize trong general
+            if general and general.get("commonFontSize") is not None:
+                target = general["commonFontSize"]
+                font_size = min(modes, key=lambda x: abs(x - target))
+            else:
+                font_size = mean(fs_values)
+        font_size = round(font_size, 1)
+    else:
+        font_size = 12.0
+    align = mostCommon([ln["Align"] for ln in lines]) or lines[-1]["Align"]
+    return {
+        "Paragraph": para_id,
+        "Text": text,
+        "MarkerText": marker_text,
+        "MarkerType": marker_type,
+        "Style": style,
+        "FontSize": font_size,
+        "Align": align,
+    }
+# ===============================
+# HELPERS
+# ===============================
+def mergeStyle(styles):
+    """
+    styles: list số 4 chữ số (CaseStyle*1000 + FontStyle)
+    - Lấy min của từng chữ số
+    """
+    digits = [list(str(s).zfill(4)) for s in styles]
+    min_digits = [min(int(d[i]) for d in digits) for i in range(4)]
+    return int("".join(str(d) for d in min_digits))
+def mostCommon(values):
+    if not values:
+        return None
+    count = Counter(values)
+    most = count.most_common(1)
+    return most[0][0] if most else None
+# ===============================
+# RESOLVE COMMONS
+# ===============================
+def recomputeCommonsInGeneralAfterMerge(mergedJson):
+    """
+    Cập nhật lại các 'common' trong mergedJson['general'] dựa trên danh sách paragraphs.
+    Các field cập nhật:
+      - commonFontSize
+      - commonFontSizes: [{FontSize, Count}, ...] (giảm dần theo Count)
+      - commonMarkers: top marker thỏa ngưỡng >= 0.5% tổng số paragraph, tối đa 10 mục
+    """
+    paragraphs = mergedJson.get("paragraphs", [])
+    total = len(paragraphs)
+    # --- Font sizes ---
+    fs_values = [p["FontSize"] for p in paragraphs if p.get("FontSize") is not None]
+    fs_counter = Counter(fs_values)
+    commonFontSizes = [{"FontSize": round(fs, 1), "Count": cnt}
+                       for fs, cnt in fs_counter.most_common()]
+    commonFontSize = commonFontSizes[0]["FontSize"] if commonFontSizes else None
+    # --- Markers ---
+    mk_values = [p["MarkerType"] for p in paragraphs if p.get("MarkerType")]
+    mk_counter = Counter(mk_values)
+    threshold = max(1, int(total * 0.005))
+    commonMarkers = [m for m, c in mk_counter.most_common(10) if c >= threshold]
+    # --- Ghi đè vào general ---
+    mergedJson["general"].update({
+        "commonFontSize": commonFontSize,
+        "commonFontSizes": commonFontSizes,
+        "commonMarkers": commonMarkers
+    })
+    return mergedJson

Libraries/PDF_QualityCheck.py ADDED Viewed

	@@ -0,0 +1,106 @@

+import re
+import fitz
+from typing import Dict, Tuple, Union
class PDFQualityChecker:
    """
    Lightweight PDF quality gate run before the main processing pipeline.

    The checker reads the text layer of every page and derives three ratios:
    characters outside the expected alphabet (font / encoding damage), runs of
    three or more spaces, and very short lines (a symptom of poor OCR).
    """

    # A non-blank line shorter than this many characters counts as "short".
    _SHORT_LINE_CHARS = 10

    def __init__(self,
                 max_invalid_ratio: float = 0.2,
                 max_whitespace_ratio: float = 0.2,
                 max_short_line_ratio: float = 0.3,
                 min_total_chars: int = 300):
        """
        Args:
            max_invalid_ratio: Highest accepted share of characters that do
                not match ``valid_char_pattern``.
            max_whitespace_ratio: Highest accepted ratio of "3+ spaces" runs
                to total characters.
            max_short_line_ratio: Stored for callers, but see the NOTE in
                ``evaluate``: the verdict does not currently use it.
            min_total_chars: Documents with fewer extracted characters are
                rejected outright.
        """
        self.max_invalid_ratio = max_invalid_ratio
        self.max_whitespace_ratio = max_whitespace_ratio
        self.max_short_line_ratio = max_short_line_ratio
        self.min_total_chars = min_total_chars
        # Characters considered valid: Latin letters, Vietnamese diacritics,
        # digits, basic punctuation and whitespace.
        self.valid_char_pattern = re.compile(r"[A-Za-zÀ-ỹĐđ0-9.,:;!?()\"'’”“–\-_\s]")

    # ============================================================
    # 1. Main entry point
    # ============================================================
    def evaluate(self, pdf: Union[str, "fitz.Document"]) -> Tuple[bool, Dict]:
        """
        Evaluate the quality of a PDF's text layer.

        Args:
            pdf: A file path, or an already opened ``fitz.Document``.

        Returns:
            ``(is_good, metrics)``. ``metrics`` always contains
            ``check_mess``; it also contains ``total_chars`` once text was
            extracted, and the three ratios when the document was long enough
            to be scored.

        Raises:
            TypeError: If ``pdf`` is neither a ``str`` nor a ``fitz.Document``.
        """
        # ---- Normalise the input ----
        opened_here = isinstance(pdf, str)
        if opened_here:
            try:
                doc = fitz.open(pdf)
            except Exception as e:
                return False, {"check_mess": f"❌ Không mở được file: {e}"}
        elif isinstance(pdf, fitz.Document):
            doc = pdf
        else:
            raise TypeError("pdf phải là str hoặc fitz.Document")

        # ---- Gather statistics ----
        try:
            text_all, short_lines, all_lines = self._collect_text(doc)
        finally:
            # Only close documents this method opened; a caller-supplied
            # document stays under the caller's control.
            if opened_here:
                doc.close()

        total_chars = len(text_all)
        # The explicit zero check keeps the divisions below safe even when
        # min_total_chars is configured as 0.
        if total_chars == 0 or total_chars < self.min_total_chars:
            return False, {
                "check_mess": "❌ File quá ngắn hoặc không có text layer",
                "total_chars": total_chars,
            }

        # ---- Error ratios ----
        # The pattern is a single character class, so the number of matches
        # equals the number of valid characters.
        valid_chars = len(self.valid_char_pattern.findall(text_all))
        invalid_ratio = (total_chars - valid_chars) / total_chars
        whitespace_excess = len(re.findall(r" {3,}", text_all))
        whitespace_ratio = whitespace_excess / total_chars
        short_line_ratio = short_lines / max(all_lines, 1)

        # ---- Verdict ----
        # NOTE(review): max_short_line_ratio is not consulted here; a document
        # only fails this criterion when *every* line is short. Confirm whether
        # that is intended before wiring the threshold in.
        is_good = (
            invalid_ratio <= self.max_invalid_ratio
            and whitespace_ratio <= self.max_whitespace_ratio
            and short_line_ratio < 1
        )
        if is_good:
            check_mess = "✅ Đạt yêu cầu"
        elif invalid_ratio > self.max_invalid_ratio:
            check_mess = "❌ Nhiều ký tự lỗi / encode sai"
        elif whitespace_ratio > self.max_whitespace_ratio:
            check_mess = "❌ Nhiều khoảng trắng thừa"
        elif short_line_ratio >= 1:
            check_mess = "⚠️ OCR hoặc mất ký tự"
        else:
            # Defensive fallback (e.g. NaN thresholds make every comparison false).
            check_mess = "❌ Văn bản lỗi nặng"

        metrics = {
            "check_mess": check_mess,
            "total_chars": total_chars,
            "invalid_ratio": round(invalid_ratio, 3),
            "whitespace_ratio": round(whitespace_ratio, 3),
            "short_line_ratio": round(short_line_ratio, 3),
        }
        return is_good, metrics

    def _collect_text(self, doc) -> Tuple[str, int, int]:
        """Return (all page text, short non-blank line count, non-blank line count)."""
        parts = []
        short_lines = 0
        all_lines = 0
        for page in doc:
            text = page.get_text("text") or ""
            if not text.strip():
                continue
            for line in text.splitlines():
                stripped = line.strip()
                if not stripped:
                    continue
                all_lines += 1
                if len(stripped) < self._SHORT_LINE_CHARS:
                    short_lines += 1
            parts.append(text + "\n")
        # Joining once avoids repeated string reallocation on large PDFs.
        return "".join(parts), short_lines, all_lines

Libraries/Summarizer_Runner.py ADDED Viewed

	@@ -0,0 +1,162 @@

+import torch
+from typing import Dict
+from . import Json_ChunkUnder
class RecursiveSummarizer:
    """
    Vietnamese summariser that combines chunking with abstractive generation.

    Long texts are split by ``chunk_builder``, each chunk is summarised with a
    seq2seq model, the partial summaries are merged, and the process repeats
    on the merged text while it is still too long.
    """

    # Chunks with fewer words than this are skipped instead of summarised.
    _MIN_CHUNK_WORDS = 20

    def __init__(
        self,
        tokenizer,
        summarizer,
        sum_device: str,
        chunk_builder: "Json_ChunkUnder.ChunkUndertheseaBuilder",
        max_length: int = 256,
        min_length: int = 64,
        max_depth: int = 5
    ):
        """
        Args:
            tokenizer: Tokenizer matching ``summarizer``; called with
                ``return_tensors="pt"`` and used to decode generated ids.
            summarizer: Seq2seq model exposing ``generate`` and ``to``.
            sum_device: Device string the tokenised inputs are moved to.
            chunk_builder: Object whose ``build(text)`` yields dicts carrying
                ``"Content"`` and ``"Index"`` keys.
            max_length: ``max_length`` passed to ``generate``.
            min_length: ``min_length`` passed to ``generate``.
            max_depth: Maximum recursion depth of ``summarize_recursive``.
        """
        self.tokenizer = tokenizer
        self.model = summarizer
        self.device = sum_device
        self.chunk_builder = chunk_builder
        self.max_length = max_length
        self.min_length = min_length
        self.max_depth = max_depth

    # ============================================================
    # 1. Summarise a single passage
    # ============================================================
    def _uses_vit5_prefix(self) -> bool:
        """Tell whether the loaded model looks like a ViT5 checkpoint."""
        # ViT5 checkpoints load as a generic T5 class, so the class name alone
        # does not reveal them; the checkpoint name is inspected as well.
        label = f"{self.model.__class__} {getattr(self.model, 'name_or_path', '')}"
        return "vit5" in label.lower()

    def _generate(self, inputs) -> str:
        """Run beam-search generation on tokenised inputs and decode the result."""
        with torch.no_grad():
            summary_ids = self.model.generate(
                **inputs,
                max_length=self.max_length,
                min_length=self.min_length,
                num_beams=4,
                no_repeat_ngram_size=3,
                early_stopping=True
            )
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()

    def summarize_single(self, text: str) -> str:
        """
        Summarise one passage with the abstractive model.

        Returns an empty string for blank input or when generation fails.
        On a CUDA out-of-memory error the model is moved to the CPU
        permanently and the passage is retried there.
        """
        if not text or not text.strip():
            return ""
        passage = text.strip()
        input_text = f"vietnews: {passage} </s>" if self._uses_vit5_prefix() else passage
        try:
            inputs = self.tokenizer(
                input_text,
                return_tensors="pt",
                truncation=True,
                max_length=1024
            )
            return self._generate(inputs.to(self.device))
        except torch.cuda.OutOfMemoryError:
            print("⚠️ GPU OOM – fallback sang CPU.")
            try:
                self.model = self.model.to("cpu")
                # Remember the move: otherwise the next call would send its
                # inputs to the GPU while the model now lives on the CPU.
                self.device = "cpu"
                return self._generate(inputs.to("cpu"))
            except Exception as e:
                print(f"❌ Lỗi khi tóm tắt đoạn: {e}")
                return ""
        except Exception as e:
            print(f"❌ Lỗi khi tóm tắt đoạn: {e}")
            return ""

    # ============================================================
    # 2. Recursive summarisation of long text
    # ============================================================
    def summarize_recursive(self, text: str, depth: int = 0, minInput: int = 256, maxInput: int = 1024) -> str:
        """
        Summarise ``text``, recursing while the merged result is still long.

        - fewer than ``minInput`` words: summarised directly;
        - otherwise: split into chunks, summarise each chunk, join the partial
          summaries, and recurse when the result exceeds ``maxInput`` words
          and ``depth`` is below ``max_depth``.

        Args:
            text: Text to summarise.
            depth: Current recursion depth (0 for the initial call).
            minInput: Word count below which the text is summarised directly.
            maxInput: Word count above which the merged summary is summarised
                again.
        """
        word_count = len(text.split())
        indent = "  " * depth
        print(f"{indent}🔹 Level {depth}: {word_count} từ")

        if word_count < minInput:
            return self.summarize_single(text)

        summaries = []
        for item in self.chunk_builder.build(text):
            content = item.get("Content", "")
            idx = item.get("Index", "?")
            wc = len(content.split())
            if wc < self._MIN_CHUNK_WORDS:
                print(f"{indent}⚠️ Bỏ qua chunk {idx} (quá ngắn)")
                continue
            print(f"{indent}🔸 Chunk {idx}: {wc} từ")
            sub_summary = self.summarize_single(content)
            if sub_summary:
                summaries.append(sub_summary)

        if not summaries:
            # Every chunk was skipped or failed; summarising the text directly
            # is better than returning an empty summary for non-empty input.
            print(f"{indent}⚠️ Không có chunk hợp lệ – tóm tắt trực tiếp.")
            return self.summarize_single(text)

        merged_summary = "\n".join(summaries)
        merged_len = len(merged_summary.split())
        print(f"{indent}🔁 Gộp {len(summaries)} summary → {merged_len} từ")

        # Recurse with the caller's thresholds, not the defaults.
        if merged_len > maxInput and depth < self.max_depth:
            return self.summarize_recursive(merged_summary, depth + 1, minInput, maxInput)
        return merged_summary

    # ============================================================
    # 3. Public entry point
    # ============================================================
    def summarize(self, full_text: str, minInput: int = 256, maxInput: int = 1024) -> Dict[str, str]:
        """
        Summarise ``full_text`` and report basic statistics.

        Returns:
            Dict with ``summary_text``, ``original_words``, ``summary_words``
            and ``compression_ratio`` (summary words / original words, or 0
            for empty input).
        """
        original_len = len(full_text.split())
        summary = self.summarize_recursive(full_text, depth=0, minInput=minInput, maxInput=maxInput)
        summary_len = len(summary.split())
        ratio = round(summary_len / original_len, 3) if original_len else 0
        print(f"\n✨ FINAL SUMMARY ({summary_len}/{original_len} từ, r={ratio}) ✨")
        return {
            "summary_text": summary,
            "original_words": original_len,
            "summary_words": summary_len,
            "compression_ratio": ratio
        }

Libraries/Summarizer_Trainer.py ADDED Viewed

	@@ -0,0 +1,223 @@

+import os
+import numpy as np
+import pandas as pd
+import json
+from typing import Optional, Union
+import evaluate
+from datasets import Dataset, DatasetDict, load_from_disk
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM,
+    DataCollatorForSeq2Seq,
+    Seq2SeqTrainer,
+    Seq2SeqTrainingArguments,
+    EarlyStoppingCallback,
+    set_seed,
+)
class SummarizationTrainer:
    """
    General-purpose fine-tuner for seq2seq summarisation models.

    Single interface: ``run(Checkpoint, ModelPath, DataPath | dataset, tokenizer)``.
    """

    def __init__(
        self,
        Max_Input_Length: int = 1024,
        Max_Target_Length: int = 256,
        prefix: str = "",
        input_column: str = "article",
        target_column: str = "summary",
        Learning_Rate: float = 3e-5,
        Weight_Decay: float = 0.01,
        Batch_Size: int = 8,
        Num_Train_Epochs: int = 3,
        gradient_accumulation_steps: int = 1,
        warmup_ratio: float = 0.05,
        lr_scheduler_type: str = "linear",
        seed: int = 42,
        num_beams: int = 4,
        generation_max_length: Optional[int] = None,
        fp16: bool = True,
        early_stopping_patience: int = 2,
        logging_steps: int = 200,
        report_to: str = "none",
    ):
        """Store the hyper-parameters and load the ROUGE metric."""
        # Hyperparams
        self.Max_Input_Length = Max_Input_Length
        self.Max_Target_Length = Max_Target_Length
        self.prefix = prefix
        self.input_column = input_column
        self.target_column = target_column
        self.Learning_Rate = Learning_Rate
        self.Weight_Decay = Weight_Decay
        self.Batch_Size = Batch_Size
        self.Num_Train_Epochs = Num_Train_Epochs
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.warmup_ratio = warmup_ratio
        self.lr_scheduler_type = lr_scheduler_type
        self.seed = seed
        self.num_beams = num_beams
        self.generation_max_length = generation_max_length
        self.fp16 = fp16
        self.early_stopping_patience = early_stopping_patience
        self.logging_steps = logging_steps
        self.report_to = report_to
        self._rouge = evaluate.load("rouge")
        # Populated by run().
        self._tokenizer = None
        self._model = None

    # =========================================================
    # 1. Load data (JSONL file or Arrow directory)
    # =========================================================
    def _to_datasetdict(self, data) -> "DatasetDict":
        """Return ``data`` as a DatasetDict, splitting a plain Dataset 90/10."""
        if isinstance(data, DatasetDict):
            return data
        if isinstance(data, Dataset):
            split = data.train_test_split(test_size=0.1, seed=self.seed)
            return DatasetDict({"train": split["train"], "validation": split["test"]})
        raise TypeError("dataset phải là datasets.Dataset hoặc datasets.DatasetDict.")

    def _load_jsonl_to_datasetdict(self, DataPath: str) -> "DatasetDict":
        """
        Read a JSONL file and split it 90/10 into train / validation.

        Blank and malformed lines are skipped; the number of malformed lines
        is reported. Rows missing either column are dropped.

        Raises:
            ValueError: If the input or target column is absent.
        """
        print(f"Đang tải dữ liệu từ {DataPath} ...")
        data_list = []
        skipped = 0
        with open(DataPath, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    data_list.append(json.loads(line))
                except json.JSONDecodeError:
                    skipped += 1
        if skipped:
            # Surface data loss instead of hiding it.
            print(f"⚠️ Bỏ qua {skipped} dòng JSON không hợp lệ")
        df = pd.DataFrame(data_list)
        if self.input_column not in df or self.target_column not in df:
            raise ValueError(f"File {DataPath} thiếu cột {self.input_column}/{self.target_column}")
        df = df[[self.input_column, self.target_column]].dropna()
        dataset = Dataset.from_pandas(df, preserve_index=False)
        split = dataset.train_test_split(test_size=0.1, seed=self.seed)
        print(f"✔ Dữ liệu chia: {len(split['train'])} train / {len(split['test'])} validation")
        return DatasetDict({"train": split["train"], "validation": split["test"]})

    def _ensure_datasetdict(
        self,
        dataset: "Optional[Union[Dataset, DatasetDict]]",
        DataPath: Optional[str],
    ) -> "DatasetDict":
        """
        Resolve the training data from an in-memory dataset or a path.

        ``dataset`` wins when given. Otherwise ``DataPath`` is loaded with
        ``load_from_disk`` when it is a directory and as JSONL when it is not.

        Raises:
            TypeError: If the data is neither a Dataset nor a DatasetDict.
            ValueError: If neither argument is provided.
        """
        if dataset is not None:
            return self._to_datasetdict(dataset)
        if DataPath:
            if os.path.isdir(DataPath):
                print(f"Load DatasetDict từ thư mục Arrow: {DataPath}")
                # load_from_disk returns a Dataset when a single split was
                # saved, so normalise it like an in-memory dataset.
                return self._to_datasetdict(load_from_disk(DataPath))
            return self._load_jsonl_to_datasetdict(DataPath)
        raise ValueError("Cần truyền dataset hoặc DataPath")

    # =========================================================
    # 2. Tokenisation
    # =========================================================
    def _preprocess_function(self, examples):
        """Tokenise a batch: prefixed inputs as features, targets as ``labels``."""
        inputs = examples[self.input_column]
        if self.prefix:
            inputs = [self.prefix + x for x in inputs]
        model_inputs = self._tokenizer(inputs, max_length=self.Max_Input_Length, truncation=True)
        # text_target replaces the deprecated as_target_tokenizer() context manager.
        labels = self._tokenizer(
            text_target=examples[self.target_column],
            max_length=self.Max_Target_Length,
            truncation=True,
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # =========================================================
    # 3. ROUGE scoring
    # =========================================================
    def _compute_metrics(self, eval_pred):
        """Decode predictions and labels, returning ROUGE scores scaled to 0-100."""
        preds, labels = eval_pred
        if isinstance(preds, tuple):
            preds = preds[0]
        pad_id = self._tokenizer.pad_token_id
        # Generated sequences are padded with -100 when batches are gathered;
        # the tokenizer cannot decode negative ids, so map them to the pad id.
        preds = np.where(preds != -100, preds, pad_id)
        labels = np.where(labels != -100, labels, pad_id)
        decoded_preds = self._tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = self._tokenizer.batch_decode(labels, skip_special_tokens=True)
        result = self._rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        return {k: round(v * 100, 4) for k, v in result.items()}

    # =========================================================
    # 4. Training
    # =========================================================
    def run(
        self,
        Checkpoint: str,
        ModelPath: str,
        DataPath: Optional[str] = None,
        dataset: "Optional[Union[Dataset, DatasetDict]]" = None,
        tokenizer: "Optional[AutoTokenizer]" = None,
    ):
        """
        Fine-tune ``Checkpoint`` and save the model and tokenizer to ``ModelPath``.

        Args:
            Checkpoint: Model name or path given to ``from_pretrained``.
            ModelPath: Output directory for checkpoints and the final model.
            DataPath: JSONL file or Arrow directory (used when ``dataset`` is None).
            dataset: In-memory Dataset / DatasetDict.
            tokenizer: Optional pre-loaded tokenizer; loaded from ``Checkpoint``
                when omitted.

        Returns:
            The ``Seq2SeqTrainer`` after training.
        """
        set_seed(self.seed)
        ds = self._ensure_datasetdict(dataset, DataPath)
        self._tokenizer = tokenizer or AutoTokenizer.from_pretrained(Checkpoint)
        print(f"Tải model checkpoint: {Checkpoint}")
        self._model = AutoModelForSeq2SeqLM.from_pretrained(Checkpoint)
        print("Tokenizing dữ liệu ...")
        tokenized = ds.map(self._preprocess_function, batched=True)
        data_collator = DataCollatorForSeq2Seq(tokenizer=self._tokenizer, model=self._model)
        gen_max_len = self.generation_max_length or self.Max_Target_Length
        training_args = Seq2SeqTrainingArguments(
            output_dir=ModelPath,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=self.Learning_Rate,
            per_device_train_batch_size=self.Batch_Size,
            per_device_eval_batch_size=self.Batch_Size,
            weight_decay=self.Weight_Decay,
            num_train_epochs=self.Num_Train_Epochs,
            predict_with_generate=True,
            generation_max_length=gen_max_len,
            generation_num_beams=self.num_beams,
            fp16=self.fp16,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            warmup_ratio=self.warmup_ratio,
            lr_scheduler_type=self.lr_scheduler_type,
            logging_steps=self.logging_steps,
            load_best_model_at_end=True,
            metric_for_best_model="rougeL",
            greater_is_better=True,
            save_total_limit=3,
            report_to=self.report_to,
        )
        trainer = Seq2SeqTrainer(
            model=self._model,
            args=training_args,
            train_dataset=tokenized["train"],
            eval_dataset=tokenized["validation"],
            tokenizer=self._tokenizer,
            data_collator=data_collator,
            compute_metrics=self._compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=self.early_stopping_patience)],
        )
        print("\n🚀 BẮT ĐẦU HUẤN LUYỆN ...")
        trainer.train()
        print("✅ HUẤN LUYỆN HOÀN TẤT.")
        trainer.save_model(ModelPath)
        self._tokenizer.save_pretrained(ModelPath)
        print(f"💾 Đã lưu model & tokenizer tại: {ModelPath}")
        return trainer

    # =========================================================
    # 5. Generate a summary
    # =========================================================
    def generate(self, text: str, max_new_tokens: Optional[int] = None) -> str:
        """
        Summarise ``text`` with the model prepared by ``run()``.

        Raises:
            RuntimeError: If ``run()`` has not been called yet.
        """
        if self._model is None or self._tokenizer is None:
            raise RuntimeError("Model/tokenizer chưa khởi tạo, hãy gọi run() trước.")
        prompt = (self.prefix + text) if self.prefix else text
        inputs = self._tokenizer(prompt, return_tensors="pt", truncation=True, max_length=self.Max_Input_Length)
        # After training the model usually sits on the GPU; inputs must follow it.
        device = getattr(self._model, "device", None)
        if device is not None:
            inputs = inputs.to(device)
        gen_len = max_new_tokens or self.Max_Target_Length
        outputs = self._model.generate(**inputs, max_new_tokens=gen_len, num_beams=self.num_beams)
        return self._tokenizer.decode(outputs[0], skip_special_tokens=True)

    # =========================================================
    # 6. Reload an Arrow dataset
    # =========================================================
    @staticmethod
    def load_local_dataset(DataPath: str) -> "DatasetDict":
        """Load a dataset previously saved with ``save_to_disk``."""
        return load_from_disk(DataPath)

README.md CHANGED Viewed

@@ -1,10 +1,27 @@
----
-title: Doc Ai Api
-emoji: 📊
-colorFrom: yellow
-colorTo: red
-sdk: docker
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# HF_Space_API (FastAPI + Docker)
+This Space exposes a **REST API** for document processing (summarize, embed, search, process PDFs).
+It is designed to **wrap your existing project** with minimal changes.
+## How to use
+1. Create a new **Hugging Face Space** → **Docker** template.
+2. Push this folder to that Space (or upload as a ZIP and use the Space UI).
+3. Set Space **Hardware** (e.g., T4 or A10G for GPU) and add **Space secrets** if needed:
+   - `HF_TOKEN` (optional, for private models)
+   - `API_SECRET` (optional, simple bearer auth)
+4. Wire your own modules inside `app.py` (see TODO markers).
+## Endpoints
+- `GET /health`
+- `POST /embed` — JSON: `{ "texts": ["..."] }`
+- `POST /summarize` — JSON: `{ "text": "..." }`
+- `POST /search` — JSON: `{ "query": "...", "k": 5 }` (dummy store until you wire your FAISS index)
+- `POST /process_pdf` — multipart/form-data: `file=@document.pdf`
+## Notes
+- Default uses `faiss-cpu`. On GPU Space you may switch to `faiss-gpu` in `requirements.txt`.
+- Avoid committing local cached models into git. Publish models to the Hub and **download at startup**.

app.py ADDED Viewed

	@@ -0,0 +1,144 @@

+"""
+FastAPI gateway for your App_Caller pipeline.
+✅ Giữ nguyên pipeline gốc (App_Caller.py)
+✅ Tương thích Hugging Face Spaces (Docker)
+✅ Có Bearer token, Swagger UI (/docs)
+✅ Endpoint: /health, /process_pdf, /search, /summarize
+"""
+import os
+import time
+from typing import Optional
+from fastapi import FastAPI, UploadFile, File, HTTPException, Depends, Header
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+# -------------------------
+# 🔒 Bearer token (optional)
+# -------------------------
API_SECRET = os.getenv("API_SECRET", "").strip()


def require_bearer(authorization: Optional[str] = Header(None)):
    """
    Validate the ``Authorization: Bearer <token>`` header.

    Authentication is disabled when the ``API_SECRET`` environment variable
    is empty.

    Raises:
        HTTPException: 401 when the header is missing or does not use the
            Bearer scheme; 403 when the token does not match ``API_SECRET``.
    """
    # Local import keeps this block self-contained.
    import hmac

    if not API_SECRET:
        return  # Authentication not enabled.
    scheme, sep, token = (authorization or "").partition(" ")
    # The auth scheme name is case-insensitive.
    if not sep or scheme.lower() != "bearer":
        raise HTTPException(status_code=401, detail="Missing Bearer token")
    # Constant-time comparison avoids leaking the secret through timing.
    # Bytes are compared because compare_digest rejects non-ASCII str.
    supplied = token.strip().encode("utf-8")
    if not hmac.compare_digest(supplied, API_SECRET.encode("utf-8")):
        raise HTTPException(status_code=403, detail="Invalid token")
+# -------------------------
+# 🧩 Import project modules
+# -------------------------
# The try body holds only the import: a failure while printing the success
# message must not be mistaken for an import failure and disable the pipeline.
try:
    import App_Caller as APP_CALLER
except Exception as e:
    # Best effort: the API still starts, and endpoints report the missing pipeline.
    APP_CALLER = None
    print(f"⚠️ Không thể import App_Caller: {e}")
else:
    print("✅ Đã load App_Caller.")
# -------------------------
# 🚀 Init FastAPI
# -------------------------
app = FastAPI(
    title="Document AI API (FastAPI)",
    version="2.0.0",
    description="API xử lý PDF: trích xuất, tóm tắt, tìm kiếm, phân loại.",
)
+# -------------------------
+# 🩺 /health
+# -------------------------
@app.get("/health")
def health(_=Depends(require_bearer)):
    """Kiểm tra trạng thái hoạt động."""
    # Liveness probe: reports the server time and whether the App_Caller
    # pipeline (and its fileProcess entry point) is available.
    pipeline_loaded = bool(APP_CALLER)
    can_process_files = hasattr(APP_CALLER, "fileProcess") if pipeline_loaded else False
    report = {"status": "ok", "time": time.time()}
    report["App_Caller"] = pipeline_loaded
    report["has_fileProcess"] = can_process_files
    return report
+# -------------------------
+# 📘 /process_pdf
+# -------------------------
@app.post("/process_pdf")
async def process_pdf(file: UploadFile = File(...), _=Depends(require_bearer)):
    """Nhận file PDF → chạy App_Caller.fileProcess → trả về summary + category."""
    # filename may be None for some multipart clients; treat that as "not a PDF".
    if not (file.filename or "").lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Chỉ chấp nhận file PDF.")
    # Check the pipeline before reading the upload so a misconfigured server
    # fails fast without buffering the whole file.
    if not APP_CALLER or not hasattr(APP_CALLER, "fileProcess"):
        raise HTTPException(status_code=500, detail="Không tìm thấy App_Caller.fileProcess().")
    pdf_bytes = await file.read()
    if not pdf_bytes:
        # An empty upload is a client error, not a processing failure.
        raise HTTPException(status_code=400, detail="File PDF rỗng.")
    try:
        result = APP_CALLER.fileProcess(pdf_bytes)
        return {
            "status": "success",
            "checkstatus": result.get("checkstatus"),
            "summary": result.get("summary"),
            "category": result.get("category"),
            "top_candidates": result.get("reranked", []),
        }
    except Exception as e:
        # Chain the cause so the original traceback appears in server logs.
        raise HTTPException(status_code=500, detail=f"Lỗi xử lý PDF: {str(e)}") from e
+# -------------------------
+# 🔍 /search
+# -------------------------
# Request body for /search: the query text and the maximum number of hits.
class SearchIn(BaseModel):
    query: str
    k: int = 10


@app.post("/search")
def search(body: SearchIn, _=Depends(require_bearer)):
    """Tìm kiếm bằng FAISS + Rerank từ App_Caller.runSearch()."""
    q = (body.query or "").strip()
    if not q:
        raise HTTPException(status_code=400, detail="query không được để trống")
    if not APP_CALLER or not hasattr(APP_CALLER, "runSearch"):
        raise HTTPException(status_code=500, detail="Không tìm thấy App_Caller.runSearch().")
    # A negative k would slice from the end of the list and silently drop
    # the wrong items, so clamp it to zero.
    limit = max(body.k, 0)
    try:
        results = APP_CALLER.runSearch(q)
        if isinstance(results, list):
            formatted = results[:limit]
        elif isinstance(results, dict) and "results" in results:
            formatted = results["results"][:limit]
        else:
            # Unknown result shape: return its string form rather than failing.
            formatted = [str(results)]
        return {"status": "success", "results": formatted}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Lỗi tìm kiếm: {str(e)}") from e
+# -------------------------
+# 🧠 /summarize
+# -------------------------
# Request body for /summarize: the text plus the word-count thresholds that
# are forwarded to the summarizer engine.
class SummIn(BaseModel):
    text: str
    minInput: int = 256
    maxInput: int = 1024


@app.post("/summarize")
def summarize_text(body: SummIn, _=Depends(require_bearer)):
    """Tóm tắt văn bản bằng App_Caller.summarizer_engine."""
    cleaned = (body.text or "").strip()
    if not cleaned:
        raise HTTPException(status_code=400, detail="text không được để trống")

    engine_available = bool(APP_CALLER) and hasattr(APP_CALLER, "summarizer_engine")
    if not engine_available:
        raise HTTPException(status_code=500, detail="Không tìm thấy App_Caller.summarizer_engine.")

    try:
        engine = APP_CALLER.summarizer_engine
        outcome = engine.summarize(cleaned, minInput=body.minInput, maxInput=body.maxInput)
        summary_text = outcome.get("summary_text", "")
    except Exception as e:
        # Any engine failure is reported to the client as a 500.
        raise HTTPException(status_code=500, detail=f"Lỗi tóm tắt: {str(e)}")
    return {"status": "success", "summary": summary_text}

requirements.txt ADDED Viewed

	@@ -0,0 +1,33 @@

+# ===============================
+# Core Python / AI Environment
+# ===============================
+torch==2.3.1
+torchvision==0.18.1
+torchaudio==2.3.1
+pytorch-cuda==12.1  # conda metapackage; NOTE: pip does not skip requirements it cannot resolve, so verify or drop this line before `pip install -r`
+# ===============================
+# Transformers Ecosystem
+# ===============================
+transformers==4.44.2
+sentence-transformers==3.0.1
+tokenizers>=0.19.1
+huggingface-hub>=0.23.4
+safetensors>=0.4.3
+accelerate==0.31.0
+datasets>=2.19.0
+evaluate>=0.4.2
+sentencepiece>=0.2.0
+protobuf>=4.25.2
+nltk>=3.9
+rouge-score>=0.1.2
+# ===============================
+# Semantic Search / FAISS
+# ===============================
+faiss-gpu==1.8.0
+# ===============================
+# PDF / Text Processing
+# ===============================
+PyMuPDF==1

start.sh ADDED Viewed

	@@ -0,0 +1,6 @@

#!/usr/bin/env bash
set -e

# Bind address and port come from the environment (Spaces sets PORT);
# 0.0.0.0 makes the server reachable from outside the container.
host="${HOST:-0.0.0.0}"
port="${PORT:-7860}"

# One worker keeps memory use predictable on small machines; raise it if needed.
exec uvicorn app:app --host "$host" --port "$port" --workers 1