# DermalCare/tools/build_index.py
import json
import os
from pathlib import Path

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

DATA_CANDIDATES = [
    "data/cosmetics_full_en_zh.jsonl",
    "data/cosmetic_full_en_zh.jsonl",  # user variant
    "cosmetics_full_en_zh.jsonl",
    "cosmetic_full_en_zh.jsonl",
]

def find_dataset():
    for p in DATA_CANDIDATES:
        if os.path.exists(p):
            return p
    raise FileNotFoundError(
        "Could not find dataset. Put it at data/cosmetics_full_en_zh.jsonl "
        "(or cosmetic_full_en_zh.jsonl)."
    )

DATA_PATH = find_dataset()
INDEX_DIR = Path("indexes")
INDEX_DIR.mkdir(parents=True, exist_ok=True)
INDEX_PATH = INDEX_DIR / "cosmetics_faiss_ip.index"
META_PATH = INDEX_DIR / "cosmetics_meta.json"
print(f"βœ… Using dataset: {DATA_PATH}")
# 1) Load the JSONL dataset (one JSON object per line)
with open(DATA_PATH, "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]
# 2) Build the text to embed (EN + ZH)
def text_of(r):
    return " \n".join([
        str(r.get("product_name_en") or ""),
        str(r.get("product_name_zh") or ""),
        str(r.get("category_en") or ""),
        str(r.get("category_zh") or ""),
        str(r.get("description_en") or ""),
        str(r.get("description_zh") or ""),
        str(r.get("concerns_en") or ""),
        str(r.get("concerns_zh") or ""),
        str(r.get("key_ingredients_en") or ""),
        str(r.get("key_ingredients_zh") or ""),
        str(r.get("usage_en") or ""),
        str(r.get("usage_zh") or ""),
    ]).strip()
corpus = [text_of(r) for r in records]
# 3) Encode (multilingual)
model_id = os.environ.get("EMB_MODEL_ID", "intfloat/multilingual-e5-base")
print(f"🧠 Embedding with: {model_id}")
model = SentenceTransformer(model_id)
embs = model.encode(corpus, normalize_embeddings=True, batch_size=64)
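# Note: E5-family models such as multilingual-e5-base are typically trained with
# "query: " / "passage: " prefixes. This script embeds the raw text, so the
# search side should embed queries with the same convention used here.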
# 4) FAISS index (cosine via inner product on normalized vectors)
d = embs.shape[1]
index = faiss.IndexFlatIP(d)
index.add(embs.astype("float32"))
faiss.write_index(index, str(INDEX_PATH))
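# IndexFlatIP performs exact (brute-force) inner-product search; with the
# normalized vectors above, the scores are cosine similarities. That is usually
# plenty for a catalogue-sized corpus; approximate indexes (IVF/HNSW) only pay
# off at much larger scale.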
# 5) Light metadata (quick lookup after search)
meta = []
for r in records:
    meta.append({
        "id": r.get("id"),
        "brand_en": r.get("brand_en"),
        "brand_zh": r.get("brand_zh"),
        "product_name_en": r.get("product_name_en"),
        "product_name_zh": r.get("product_name_zh"),
        "category_en": r.get("category_en"),
        "category_zh": r.get("category_zh"),
        "price_value": r.get("price_value"),
        "price_currency": r.get("price_currency"),
        "source_url": r.get("source_url"),
    })
with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)
print("πŸŽ‰ Built index + metadata")
print(f" - {INDEX_PATH}")
print(f" - {META_PATH}")