import json
import os
from pathlib import Path

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

DATA_CANDIDATES = [
    "data/cosmetics_full_en_zh.jsonl",
    "data/cosmetic_full_en_zh.jsonl",  # user variant
    "cosmetics_full_en_zh.jsonl",
    "cosmetic_full_en_zh.jsonl",
]


def find_dataset():
    for p in DATA_CANDIDATES:
        if os.path.exists(p):
            return p
    raise FileNotFoundError(
        "Could not find dataset. Put it at data/cosmetics_full_en_zh.jsonl "
        "(or cosmetic_full_en_zh.jsonl)."
    )


DATA_PATH = find_dataset()
INDEX_DIR = Path("indexes")
INDEX_DIR.mkdir(parents=True, exist_ok=True)
INDEX_PATH = INDEX_DIR / "cosmetics_faiss_ip.index"
META_PATH = INDEX_DIR / "cosmetics_meta.json"
print(f"✅ Using dataset: {DATA_PATH}")

# 1) Load
with open(DATA_PATH, "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]


# 2) Build the text to embed (EN + ZH)
def text_of(r):
    return " \n".join([
        str(r.get("product_name_en") or ""),
        str(r.get("product_name_zh") or ""),
        str(r.get("category_en") or ""),
        str(r.get("category_zh") or ""),
        str(r.get("description_en") or ""),
        str(r.get("description_zh") or ""),
        str(r.get("concerns_en") or ""),
        str(r.get("concerns_zh") or ""),
        str(r.get("key_ingredients_en") or ""),
        str(r.get("key_ingredients_zh") or ""),
        str(r.get("usage_en") or ""),
        str(r.get("usage_zh") or ""),
    ]).strip()


corpus = [text_of(r) for r in records]

# 3) Encode (multilingual)
model_id = os.environ.get("EMB_MODEL_ID", "intfloat/multilingual-e5-base")
print(f"🧠 Embedding with: {model_id}")
model = SentenceTransformer(model_id)
embs = model.encode(corpus, normalize_embeddings=True, batch_size=64)

# 4) FAISS index (cosine via inner product on normalized vectors)
d = embs.shape[1]
index = faiss.IndexFlatIP(d)
index.add(embs.astype("float32"))
faiss.write_index(index, str(INDEX_PATH))

# 5) Light metadata (quick lookup after search)
meta = []
for r in records:
    meta.append({
        "id": r.get("id"),
        "brand_en": r.get("brand_en"),
        "brand_zh": r.get("brand_zh"),
        "product_name_en": r.get("product_name_en"),
        "product_name_zh": r.get("product_name_zh"),
        "category_en": r.get("category_en"),
        "category_zh": r.get("category_zh"),
        "price_value": r.get("price_value"),
        "price_currency": r.get("price_currency"),
        "source_url": r.get("source_url"),
    })

with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("🎉 Built index + metadata")
print(f" - {INDEX_PATH}")
print(f" - {META_PATH}")
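
# --- Optional sanity check (a minimal sketch, not part of the build pipeline).
# It reuses `model`, `index`, and `meta` from above so the query vector lives in
# the same embedding space as the indexed passages. The RUN_SMOKE_TEST env var
# and the sample query string are illustrative assumptions, not project conventions.
if os.environ.get("RUN_SMOKE_TEST") == "1":
    query = "gentle moisturizer for sensitive skin 敏感肌保湿面霜"
    q = model.encode([query], normalize_embeddings=True).astype("float32")
    scores, ids = index.search(q, 5)  # top-5 neighbors by inner product (= cosine here)
    for score, i in zip(scores[0], ids[0]):
        hit = meta[i]
        print(f"{score:.3f}  {hit['product_name_en']} / {hit['product_name_zh']}")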