Spaces:

rishabhsetiya
/

CAIAssignmentGradio

Sleeping

App Files Files Community

rishabhsetiya committed on Aug 24

Commit

c69001b

verified ·

1 Parent(s): e56befa

Create generate_indexes.py

Browse files

Files changed (1) hide show

generate_indexes.py +163 -0

generate_indexes.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import os
+import re
+import json
+import pickle
+from typing import List, Dict
+import numpy as np
+import faiss
+import pandas as pd
+import tabula
+from sentence_transformers import SentenceTransformer
+from rank_bm25 import BM25Okapi
# ---------------- Config ----------------
# Embedding model, input PDF and the directory that receives every artifact.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
PDF_PATH = "MakeMyTrip_Financial_Statements.pdf"
OUT_DIR = "data/index_merged"

# Chunk dumps: one file per chunk size plus the merged list.
CHUNKS_100_PATH, CHUNKS_400_PATH, CHUNKS_MERGED_PATH = (
    os.path.join(OUT_DIR, filename)
    for filename in ("chunks_100.json", "chunks_400.json", "chunks_merged.json")
)

# Retrieval indexes and the metadata pickle, all built from the merged chunks.
FAISS_PATH, BM25_PATH, META_PATH = (
    os.path.join(OUT_DIR, filename)
    for filename in ("faiss_merged.index", "bm25_merged.pkl", "meta_merged.pkl")
)
# ---------------- Utils ----------------
# Runs of ASCII letters/digits; everything else acts as a separator.
_tok_pat = re.compile(r"[a-z0-9]+", re.I)


def simple_tokenize(text: str):
    """Return the lower-cased alphanumeric tokens of *text*.

    Falsy input (``None`` or an empty string) yields an empty list.
    """
    if not text:
        return []
    lowered = text.lower()
    return _tok_pat.findall(lowered)
def create_chunks(texts: List[str], max_tokens: int) -> List[str]:
    """Greedily pack consecutive texts into chunks of about ``max_tokens`` words.

    Each text is measured by its number of ``\\w+`` word tokens. Texts are
    appended to the current chunk until adding the next one would exceed
    ``max_tokens``; the chunk is then emitted (texts joined by a single space)
    and a new one is started.

    Texts are never split, so a single text longer than ``max_tokens`` becomes
    a chunk of its own that exceeds the budget.

    Args:
        texts: Texts to pack, in order.
        max_tokens: Word budget per chunk.

    Returns:
        The chunk strings, in input order. Empty when ``texts`` is empty.
    """
    chunks: List[str] = []
    current_chunk: List[str] = []
    current_tokens = 0
    for text in texts:
        n_tokens = len(re.findall(r"\w+", text))
        # Flush only when something is buffered: with an empty buffer, an
        # over-budget text would otherwise emit a spurious "" chunk.
        if current_chunk and current_tokens + n_tokens > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_tokens = [], 0
        current_chunk.append(text)
        current_tokens += n_tokens
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
# A label counts as a year column when it starts with a 20xx year.
_YEAR_LABEL = re.compile(r"20\d{2}")
# Cell contents that mean "no figure reported" and are dropped.
_EMPTY_VALUES = frozenset({"", "-", "—", "nan"})


def _clean_label(label) -> str:
    """Return *label* as text with whitespace runs collapsed and ends trimmed."""
    return " ".join(str(label).split())


def extract_tables_from_pdf(pdf_path: str, pages="all") -> List[Dict]:
    """Extract tables from a financial PDF into one record per metric row.

    Every table returned by ``tabula.read_pdf`` is scanned for columns whose
    label starts with a 20xx year. If the first data row holds such labels it
    is promoted to the header; otherwise the existing column labels are used.
    Tables without any year column are skipped.

    Args:
        pdf_path: Path of the PDF to read.
        pages: Page selection forwarded unchanged to ``tabula.read_pdf``.

    Returns:
        A list of dicts with the keys ``id`` (``"table-<n>"``, numbered in
        emission order), ``metric`` (first cell of the row), ``years`` (the
        year labels that had a value), ``values`` (label -> value with
        thousands separators removed), ``content`` (text rendering used for
        indexing) and ``source`` (always ``"table"``). Rows with a blank or
        ``"note"`` metric, or without any usable value, are omitted.
    """
    tables = tabula.read_pdf(
        pdf_path,
        pages=pages,
        multiple_tables=True,
        pandas_options={"dtype": str},
    )
    table_rows: List[Dict] = []
    for df in tables:
        if df.empty:
            continue
        # Wrapped cell text may carry "\r" as well as "\n"; flatten both.
        df = df.replace(r"[\r\n]+", " ", regex=True).fillna("")

        # Labels are cleaned BEFORE the year test so that stray whitespace
        # (" 2023") or non-string cells cannot hide a year column.
        first_row = [_clean_label(cell) for cell in df.iloc[0]]
        if any(_YEAR_LABEL.match(label) for label in first_row):
            labels = first_row
            df = df.iloc[1:].reset_index(drop=True)
        else:
            labels = [_clean_label(col) for col in df.columns]

        # Resolve the year columns once per table instead of once per row.
        year_cols = [
            (pos, label)
            for pos, label in enumerate(labels)
            if _YEAR_LABEL.match(label)
        ]
        if not year_cols:
            continue

        for cells in df.itertuples(index=False, name=None):
            metric = str(cells[0]).strip()
            if not metric or metric.lower() == "note":
                continue
            values = {}
            for pos, label in year_cols:
                clean_val = str(cells[pos]).replace(",", "").strip()
                if clean_val not in _EMPTY_VALUES:
                    values[label] = clean_val
            if not values:
                continue
            table_rows.append({
                "id": f"table-{len(table_rows)}",
                "metric": metric,
                "years": list(values.keys()),
                "values": values,
                "content": f"{metric} values: {json.dumps(values)}",
                "source": "table",
            })
    print(f"Extracted {len(table_rows)} rows from PDF tables")
    return table_rows
def build_dense_faiss(texts: List[str], out_path: str):
    """Embed *texts* and save an inner-product FAISS index to *out_path*.

    Embeddings are L2-normalised before being added to the index, so the
    inner-product scores it returns are cosine similarities.

    Args:
        texts: Documents to embed; row ``i`` of the index is ``texts[i]``.
        out_path: File the index is written to.

    Raises:
        ValueError: If ``texts`` is empty (there is no embedding dimension to
            build an index from).
    """
    if len(texts) == 0:
        raise ValueError("Cannot build a FAISS index from an empty list of texts")
    print(f"Embedding {len(texts)} docs with {EMBED_MODEL} ...")
    model = SentenceTransformer(EMBED_MODEL)
    emb = model.encode(texts, convert_to_numpy=True, batch_size=64, show_progress_bar=True)
    # FAISS works on C-contiguous float32 matrices; enforce that explicitly
    # rather than relying on what the encoder happens to return.
    emb = np.ascontiguousarray(emb, dtype=np.float32)
    faiss.normalize_L2(emb)
    index = faiss.IndexFlatIP(emb.shape[1])
    index.add(emb)
    faiss.write_index(index, out_path)
    print(f"FAISS index built & saved -> {out_path}")
def build_bm25(texts: List[str], out_path: str):
    """Build a BM25 index over *texts* and pickle it to *out_path*.

    The pickle holds a dict with the fitted ``BM25Okapi`` object under
    ``"bm25"`` and the tokenised corpus under ``"tokenized_corpus"``.

    Args:
        texts: Documents to index, tokenised with ``simple_tokenize``.
        out_path: File the pickle is written to.

    Raises:
        ValueError: If ``texts`` is empty or no document yields any token;
            BM25 statistics are undefined for such a corpus.
    """
    if len(texts) == 0:
        raise ValueError("Cannot build a BM25 index from an empty list of texts")
    tokenized = [simple_tokenize(t) for t in texts]
    if not any(tokenized):
        raise ValueError("Cannot build a BM25 index: no document contains any token")
    bm25 = BM25Okapi(tokenized)
    with open(out_path, "wb") as f:
        pickle.dump({"bm25": bm25, "tokenized_corpus": tokenized}, f)
    print(f"BM25 index built & saved -> {out_path}")
# ---------------- Main ----------------
def main():
    """Run the pipeline: PDF tables -> chunks -> FAISS and BM25 indexes.

    Table rows are extracted from ``PDF_PATH``, packed into chunks of 100 and
    400 words, saved separately and as one merged list, and the merged list is
    indexed with both FAISS and BM25. All artifacts go under ``OUT_DIR``.
    """

    def _write_json(payload, path):
        # Human-readable UTF-8 JSON, non-ASCII characters kept as-is.
        with open(path, "w", encoding="utf-8") as fh:
            json.dump(payload, fh, indent=2, ensure_ascii=False)

    os.makedirs(OUT_DIR, exist_ok=True)

    # 1) Table rows -> their text renderings
    table_docs = extract_tables_from_pdf(PDF_PATH, pages="all")
    all_texts = [doc["content"] for doc in table_docs]

    # 2) Two chunkings of the same texts
    small_chunks = create_chunks(all_texts, 100)
    large_chunks = create_chunks(all_texts, 400)

    # 3) Persist each chunking on its own
    _write_json(small_chunks, CHUNKS_100_PATH)
    _write_json(large_chunks, CHUNKS_400_PATH)
    print(f"Saved {len(small_chunks)} chunks_100 -> {CHUNKS_100_PATH}")
    print(f"Saved {len(large_chunks)} chunks_400 -> {CHUNKS_400_PATH}")

    # 4) Merge, tagging every chunk with its id and chunk size
    merged = [
        {"id": f"100-{idx}", "chunk_size": 100, "content": chunk}
        for idx, chunk in enumerate(small_chunks)
    ]
    merged.extend(
        {"id": f"400-{idx}", "chunk_size": 400, "content": chunk}
        for idx, chunk in enumerate(large_chunks)
    )

    # 5) Persist the merged list
    _write_json(merged, CHUNKS_MERGED_PATH)
    print(f"Saved {len(merged)} merged chunks -> {CHUNKS_MERGED_PATH}")

    # 6) Both indexes are built over the merged chunk texts
    merged_texts = [entry["content"] for entry in merged]
    build_dense_faiss(merged_texts, FAISS_PATH)
    build_bm25(merged_texts, BM25_PATH)

    # 7) Metadata, in the same order as the index rows
    with open(META_PATH, "wb") as meta_file:
        pickle.dump(merged, meta_file)
    print(f"Saved metadata -> {META_PATH}")
    print("\n✅ Done. Created 100 + 400 chunks separately and merged them for unified FAISS & BM25 indexes!")


if __name__ == "__main__":
    main()