# DermalCare/tools/build_index.py
import json
import os
from pathlib import Path

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

DATA_CANDIDATES = [
    "data/cosmetics_full_en_zh.jsonl",
    "data/cosmetic_full_en_zh.jsonl",  # user variant
    "cosmetics_full_en_zh.jsonl",
    "cosmetic_full_en_zh.jsonl",
]

def find_dataset():
    for p in DATA_CANDIDATES:
        if os.path.exists(p):
            return p
    raise FileNotFoundError(
        "Could not find dataset. Put it at data/cosmetics_full_en_zh.jsonl "
        "(or cosmetic_full_en_zh.jsonl)."
    )

DATA_PATH = find_dataset()
INDEX_DIR = Path("indexes")
INDEX_DIR.mkdir(parents=True, exist_ok=True)
INDEX_PATH = INDEX_DIR / "cosmetics_faiss_ip.index"
META_PATH = INDEX_DIR / "cosmetics_meta.json"
print(f"βœ… Using dataset: {DATA_PATH}")
# 1) Load the JSONL dataset (one JSON object per line)
with open(DATA_PATH, "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]
# 2) Build the text to embed (EN + ZH)
def text_of(r):
    return " \n".join([
        str(r.get("product_name_en") or ""),
        str(r.get("product_name_zh") or ""),
        str(r.get("category_en") or ""),
        str(r.get("category_zh") or ""),
        str(r.get("description_en") or ""),
        str(r.get("description_zh") or ""),
        str(r.get("concerns_en") or ""),
        str(r.get("concerns_zh") or ""),
        str(r.get("key_ingredients_en") or ""),
        str(r.get("key_ingredients_zh") or ""),
        str(r.get("usage_en") or ""),
        str(r.get("usage_zh") or ""),
    ]).strip()
corpus = [text_of(r) for r in records]
# 3) Encode (multilingual)
model_id = os.environ.get("EMB_MODEL_ID", "intfloat/multilingual-e5-base")
print(f"🧠 Embedding with: {model_id}")
model = SentenceTransformer(model_id)
embs = model.encode(corpus, normalize_embeddings=True, batch_size=64)
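# Note: E5-family models such as multilingual-e5-base are typically trained with
# "query: " / "passage: " prefixes. This script embeds the raw text, so the
# search side should embed queries with the same convention used here.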
# 4) FAISS index (cosine via inner product on normalized vectors)
d = embs.shape[1]
index = faiss.IndexFlatIP(d)
index.add(embs.astype("float32"))
faiss.write_index(index, str(INDEX_PATH))
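# IndexFlatIP performs exact (brute-force) inner-product search; with the
# normalized vectors above, the scores are cosine similarities. That is usually
# plenty for a catalogue-sized corpus; approximate indexes (IVF/HNSW) only pay
# off at much larger scale.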
# 5) Light metadata (quick lookup after search)
meta = []
for r in records:
    meta.append({
        "id": r.get("id"),
        "brand_en": r.get("brand_en"),
        "brand_zh": r.get("brand_zh"),
        "product_name_en": r.get("product_name_en"),
        "product_name_zh": r.get("product_name_zh"),
        "category_en": r.get("category_en"),
        "category_zh": r.get("category_zh"),
        "price_value": r.get("price_value"),
        "price_currency": r.get("price_currency"),
        "source_url": r.get("source_url"),
    })
with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)
print("πŸŽ‰ Built index + metadata")
print(f" - {INDEX_PATH}")
print(f" - {META_PATH}")