import os, json, faiss, numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer

DATA_CANDIDATES = [
    "data/cosmetics_full_en_zh.jsonl",
    "data/cosmetic_full_en_zh.jsonl",  # user variant
    "cosmetics_full_en_zh.jsonl",
    "cosmetic_full_en_zh.jsonl",
]

def find_dataset():
    for p in DATA_CANDIDATES:
        if os.path.exists(p):
            return p
    raise FileNotFoundError(
        "Could not find dataset. Put it at data/cosmetics_full_en_zh.jsonl "
        "(or cosmetic_full_en_zh.jsonl)."
    )

DATA_PATH = find_dataset()

INDEX_DIR = Path("indexes")
INDEX_DIR.mkdir(parents=True, exist_ok=True)
INDEX_PATH = INDEX_DIR / "cosmetics_faiss_ip.index"
META_PATH = INDEX_DIR / "cosmetics_meta.json"

print(f"Using dataset: {DATA_PATH}")

# 1) Load (one JSON object per line; skip blank lines)
with open(DATA_PATH, "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]
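# For reference, each record is expected to expose (some of) the bilingual fields
# read below. The example values here are purely illustrative, not real data:
# {"id": "p001", "brand_en": "ExampleBrand", "brand_zh": "示例品牌",
#  "product_name_en": "Hydrating Serum", "product_name_zh": "保湿精华",
#  "category_en": "Serum", "category_zh": "精华",
#  "description_en": "...", "description_zh": "...",
#  "concerns_en": "dryness", "concerns_zh": "干燥",
#  "key_ingredients_en": "hyaluronic acid", "key_ingredients_zh": "透明质酸",
#  "usage_en": "...", "usage_zh": "...",
#  "price_value": 29.0, "price_currency": "USD",
#  "source_url": "https://example.com/product/p001"}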
# 2) Build the text to embed (EN + ZH)
def text_of(r):
    return " \n".join([
        str(r.get("product_name_en") or ""),
        str(r.get("product_name_zh") or ""),
        str(r.get("category_en") or ""),
        str(r.get("category_zh") or ""),
        str(r.get("description_en") or ""),
        str(r.get("description_zh") or ""),
        str(r.get("concerns_en") or ""),
        str(r.get("concerns_zh") or ""),
        str(r.get("key_ingredients_en") or ""),
        str(r.get("key_ingredients_zh") or ""),
        str(r.get("usage_en") or ""),
        str(r.get("usage_zh") or ""),
    ]).strip()

corpus = [text_of(r) for r in records]

# 3) Encode (multilingual)
model_id = os.environ.get("EMB_MODEL_ID", "intfloat/multilingual-e5-base")
print(f"Embedding with: {model_id}")
model = SentenceTransformer(model_id)
embs = model.encode(corpus, normalize_embeddings=True, batch_size=64)

# 4) FAISS index (cosine via inner product on normalized vectors)
d = embs.shape[1]
index = faiss.IndexFlatIP(d)
index.add(embs.astype("float32"))
faiss.write_index(index, str(INDEX_PATH))

# 5) Light metadata (quick lookup after search)
meta = []
for r in records:
    meta.append({
        "id": r.get("id"),
        "brand_en": r.get("brand_en"),
        "brand_zh": r.get("brand_zh"),
        "product_name_en": r.get("product_name_en"),
        "product_name_zh": r.get("product_name_zh"),
        "category_en": r.get("category_en"),
        "category_zh": r.get("category_zh"),
        "price_value": r.get("price_value"),
        "price_currency": r.get("price_currency"),
        "source_url": r.get("source_url"),
    })

with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("Built index + metadata")
print(f" - {INDEX_PATH}")
print(f" - {META_PATH}")