rishabhsetiya committed on
Commit c69001b · verified · 1 Parent(s): e56befa

Create generate_indexes.py

Files changed (1)
  1. generate_indexes.py +163 -0
generate_indexes.py ADDED
@@ -0,0 +1,163 @@
+ import os
+ import re
+ import json
+ import pickle
+ from typing import List, Dict
+
+ import numpy as np
+ import faiss
+ import pandas as pd
+ import tabula
+ from sentence_transformers import SentenceTransformer
+ from rank_bm25 import BM25Okapi
+
+ # ---------------- Config ----------------
+ EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+
+ PDF_PATH = "MakeMyTrip_Financial_Statements.pdf"
+ OUT_DIR = "data/index_merged"
+
+ # Paths for saved chunks & indices
+ CHUNKS_100_PATH = os.path.join(OUT_DIR, "chunks_100.json")
+ CHUNKS_400_PATH = os.path.join(OUT_DIR, "chunks_400.json")
+ CHUNKS_MERGED_PATH = os.path.join(OUT_DIR, "chunks_merged.json")
+
+ FAISS_PATH = os.path.join(OUT_DIR, "faiss_merged.index")
+ BM25_PATH = os.path.join(OUT_DIR, "bm25_merged.pkl")
+ META_PATH = os.path.join(OUT_DIR, "meta_merged.pkl")
+
+ # ---------------- Utils ----------------
+ _tok_pat = re.compile(r"[a-z0-9]+", re.I)
+ def simple_tokenize(text: str):
+     return _tok_pat.findall((text or "").lower())
+
+ def create_chunks(texts: List[str], max_tokens: int) -> List[str]:
+     """Greedily group texts into chunks of at most max_tokens words (counted with a simple \w+ split)."""
+     chunks, current_chunk, current_tokens = [], [], 0
+     for text in texts:
+         tokens = re.findall(r"\w+", text)
+         # Flush the running chunk (if non-empty) before it would exceed the budget.
+         if current_chunk and current_tokens + len(tokens) > max_tokens:
+             chunks.append(" ".join(current_chunk))
+             current_chunk, current_tokens = [], 0
+         current_chunk.append(text)
+         current_tokens += len(tokens)
+     if current_chunk:
+         chunks.append(" ".join(current_chunk))
+     return chunks
+
+ def extract_tables_from_pdf(pdf_path: str, pages="all") -> List[Dict]:
+     """Extract tables from a financial PDF into structured row-year-value dicts."""
+     tables = tabula.read_pdf(
+         pdf_path,
+         pages=pages,
+         multiple_tables=True,
+         pandas_options={'dtype': str}
+     )
+
+     table_rows = []
+     row_id = 0
+
+     for df in tables:
+         if df.empty:
+             continue
+
+         df = df.replace(r'\n', ' ', regex=True).fillna("")
+
+         # Promote the first row to header when it contains year columns (e.g. "2023").
+         headers = list(df.iloc[0])
+         if any(re.match(r"20\d{2}", str(c)) for c in headers):
+             df.columns = [c.strip() for c in headers]
+             df = df.drop(0).reset_index(drop=True)
+
+         for _, row in df.iterrows():
+             metric = str(row.iloc[0]).strip()
+             if not metric or metric.lower() in ["note", ""]:
+                 continue
+
+             # Collect the value under each year column, dropping thousands separators and blanks.
+             values = {}
+             for col, val in row.items():
+                 if re.match(r"20\d{2}", str(col)):
+                     clean_val = str(val).replace(",", "").strip()
+                     if clean_val and clean_val not in ["-", "—", "nan"]:
+                         values[str(col)] = clean_val
+
+             if not values:
+                 continue
+
+             table_rows.append({
+                 "id": f"table-{row_id}",
+                 "metric": metric,
+                 "years": list(values.keys()),
+                 "values": values,
+                 "content": f"{metric} values: {json.dumps(values)}",
+                 "source": "table"
+             })
+             row_id += 1
+
+     print(f"Extracted {len(table_rows)} rows from PDF tables")
+     return table_rows
+
+ def build_dense_faiss(texts: List[str], out_path: str):
+     print(f"Embedding {len(texts)} docs with {EMBED_MODEL} ...")
+     model = SentenceTransformer(EMBED_MODEL)
+     emb = model.encode(texts, convert_to_numpy=True, batch_size=64, show_progress_bar=True)
+     # L2-normalize so that inner-product search is equivalent to cosine similarity.
+     faiss.normalize_L2(emb)
+     dim = emb.shape[1]
+
+     index = faiss.IndexFlatIP(dim)
+     index.add(emb)
+     faiss.write_index(index, out_path)
+     print(f"FAISS index built & saved -> {out_path}")
+
+ def build_bm25(texts: List[str], out_path: str):
+     tokenized = [simple_tokenize(t) for t in texts]
+     bm25 = BM25Okapi(tokenized)
+     with open(out_path, "wb") as f:
+         pickle.dump({"bm25": bm25, "tokenized_corpus": tokenized}, f)
+     print(f"BM25 index built & saved -> {out_path}")
+
+ # ---------------- Main ----------------
+ def main():
+     os.makedirs(OUT_DIR, exist_ok=True)
+
+     # 1) Extract table rows
+     docs = extract_tables_from_pdf(PDF_PATH, pages="all")
+     all_texts = [d["content"] for d in docs]
+
+     # 2) Create chunks of size 100 and 400
+     chunks_100 = create_chunks(all_texts, 100)
+     chunks_400 = create_chunks(all_texts, 400)
+
+     # 3) Save them separately
+     with open(CHUNKS_100_PATH, "w", encoding="utf-8") as f:
+         json.dump(chunks_100, f, indent=2, ensure_ascii=False)
+     with open(CHUNKS_400_PATH, "w", encoding="utf-8") as f:
+         json.dump(chunks_400, f, indent=2, ensure_ascii=False)
+     print(f"Saved {len(chunks_100)} chunks_100 -> {CHUNKS_100_PATH}")
+     print(f"Saved {len(chunks_400)} chunks_400 -> {CHUNKS_400_PATH}")
+
+     # 4) Merge with metadata
+     merged = []
+     for i, ch in enumerate(chunks_100):
+         merged.append({"id": f"100-{i}", "chunk_size": 100, "content": ch})
+     for i, ch in enumerate(chunks_400):
+         merged.append({"id": f"400-{i}", "chunk_size": 400, "content": ch})
+
+     # 5) Save merged chunks
+     with open(CHUNKS_MERGED_PATH, "w", encoding="utf-8") as f:
+         json.dump(merged, f, indent=2, ensure_ascii=False)
+     print(f"Saved {len(merged)} merged chunks -> {CHUNKS_MERGED_PATH}")
+
+     # 6) Build FAISS & BM25 on merged chunks
+     texts = [m["content"] for m in merged]
+     build_dense_faiss(texts, FAISS_PATH)
+     build_bm25(texts, BM25_PATH)
+
+     # 7) Save metadata
+     with open(META_PATH, "wb") as f:
+         pickle.dump(merged, f)
+     print(f"Saved metadata -> {META_PATH}")
+
+     print("\n✅ Done. Created 100 + 400 chunks separately and merged them for unified FAISS & BM25 indexes!")
+
+ if __name__ == "__main__":
+     main()
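
For reference, a minimal retrieval-side sketch of how the artifacts written by this script could be loaded and queried. This is not part of the commit; the helper names, the example query, and the top-k value are illustrative.

import pickle
import re

import faiss
from sentence_transformers import SentenceTransformer

OUT_DIR = "data/index_merged"  # must match the OUT_DIR used by generate_indexes.py

# Load the dense index, the pickled BM25 object, and the merged-chunk metadata.
index = faiss.read_index(f"{OUT_DIR}/faiss_merged.index")
with open(f"{OUT_DIR}/bm25_merged.pkl", "rb") as f:
    bm25 = pickle.load(f)["bm25"]
with open(f"{OUT_DIR}/meta_merged.pkl", "rb") as f:
    meta = pickle.load(f)

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def dense_search(query: str, k: int = 5):
    # Normalize the query embedding the same way the index was built, so inner product = cosine.
    q = model.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q)
    scores, ids = index.search(q, k)
    return [(meta[i]["id"], float(s)) for i, s in zip(ids[0], scores[0])]

def sparse_search(query: str, k: int = 5):
    # Tokenize the query with the same regex used for the BM25 corpus.
    toks = re.findall(r"[a-z0-9]+", query.lower())
    scores = bm25.get_scores(toks)
    top = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    return [(meta[i]["id"], float(scores[i])) for i in top]

print(dense_search("revenue from operations 2023"))
print(sparse_search("revenue from operations 2023"))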