import json
import os
from pathlib import Path

import faiss
from sentence_transformers import SentenceTransformer

DATA_CANDIDATES = [
    "data/cosmetics_full_en_zh.jsonl",
    "data/cosmetic_full_en_zh.jsonl",            # user variant
    "cosmetics_full_en_zh.jsonl",
    "cosmetic_full_en_zh.jsonl",
]

def find_dataset():
    for p in DATA_CANDIDATES:
        if os.path.exists(p):
            return p
    raise FileNotFoundError("Could not find dataset. Put it at data/cosmetics_full_en_zh.jsonl (or cosmetic_full_en_zh.jsonl).")

DATA_PATH = find_dataset()
INDEX_DIR = Path("indexes"); INDEX_DIR.mkdir(parents=True, exist_ok=True)
INDEX_PATH = INDEX_DIR / "cosmetics_faiss_ip.index"
META_PATH  = INDEX_DIR / "cosmetics_meta.json"

print(f"✅ Using dataset: {DATA_PATH}")

# 1) Load (one JSON record per line; skip blank lines)
with open(DATA_PATH, "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

# 2) Build the text to embed (EN + ZH)
def text_of(r):
    return " \n".join([
        str(r.get("product_name_en") or ""),
        str(r.get("product_name_zh") or ""),
        str(r.get("category_en") or ""),
        str(r.get("category_zh") or ""),
        str(r.get("description_en") or ""),
        str(r.get("description_zh") or ""),
        str(r.get("concerns_en") or ""),
        str(r.get("concerns_zh") or ""),
        str(r.get("key_ingredients_en") or ""),
        str(r.get("key_ingredients_zh") or ""),
        str(r.get("usage_en") or ""),
        str(r.get("usage_zh") or ""),
    ]).strip()

corpus = [text_of(r) for r in records]

# 3) Encode (multilingual)
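#    Note: E5-family models (the default below) are trained with
#    "query: " / "passage: " prefixes; prepending "passage: " to each
#    corpus entry (and "query: " at search time) typically improves recall.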
model_id = os.environ.get("EMB_MODEL_ID", "intfloat/multilingual-e5-base")
print(f"🧠 Embedding with: {model_id}")
model = SentenceTransformer(model_id)
embs = model.encode(corpus, normalize_embeddings=True, batch_size=64)

# 4) FAISS index (cosine via inner product on normalized vectors)
d = embs.shape[1]
index = faiss.IndexFlatIP(d)
index.add(embs.astype("float32"))
faiss.write_index(index, str(INDEX_PATH))

# 5) Light metadata (quick lookup after search)
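#    `meta` is written in the same order as `corpus`/`embs`, so the row id
#    returned by index.search() doubles as the position in `meta`.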
meta = []
for r in records:
    meta.append({
        "id": r.get("id"),
        "brand_en": r.get("brand_en"),
        "brand_zh": r.get("brand_zh"),
        "product_name_en": r.get("product_name_en"),
        "product_name_zh": r.get("product_name_zh"),
        "category_en": r.get("category_en"),
        "category_zh": r.get("category_zh"),
        "price_value": r.get("price_value"),
        "price_currency": r.get("price_currency"),
        "source_url": r.get("source_url"),
    })

with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("🎉 Built index + metadata")
print(f"  - {INDEX_PATH}")
print(f"  - {META_PATH}")