# tools/build_meta_only.py
"""Build a metadata-only JSON file from the cosmetics JSONL dataset.

The script locates the dataset among ``DATA_CANDIDATES``, keeps a fixed set
of fields from every record, and writes the result as a JSON array to
``OUT_META``.
"""
import json
import os
from pathlib import Path

# Candidate dataset locations, probed in order relative to the working directory.
DATA_CANDIDATES = [
    "data/cosmetics_full_en_zh.jsonl",
    "data/cosmetic_full_en_zh.jsonl",
    "cosmetics_full_en_zh.jsonl",
    "cosmetic_full_en_zh.jsonl",
]

OUT_DIR = Path("indexes")
OUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_META = OUT_DIR / "cosmetics_meta.json"

# Fields copied from each dataset record into the metadata file, in output order.
_META_FIELDS = (
    "id",
    "brand_en",
    "brand_zh",
    "product_name_en",
    "product_name_zh",
    "category_en",
    "category_zh",
    "price_value",
    "price_currency",
    "source_url",
    "image_url",  # included so consumers of the metadata can reference the image
)


def find_dataset() -> str:
    """Return the first path in ``DATA_CANDIDATES`` that exists.

    Raises:
        FileNotFoundError: if none of the candidate paths exists.
    """
    for candidate in DATA_CANDIDATES:
        if os.path.exists(candidate):
            return candidate
    raise FileNotFoundError("Dataset JSONL not found in expected locations.")


def _load_records(path) -> list:
    """Parse the JSONL file at *path* into a list, one JSON value per line."""
    records = []
    # The context manager closes the handle even when a line fails to parse.
    with open(path, "r", encoding="utf-8") as src:
        for line in src:
            # Blank lines (e.g. a trailing empty line) are not records and
            # would otherwise make json.loads raise.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records


def main() -> None:
    """Write the selected fields of every dataset record to ``OUT_META``.

    Fields missing from a record are written as ``null`` (``dict.get``
    returns ``None`` for them).

    Raises:
        FileNotFoundError: if no dataset file is found.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = _load_records(find_dataset())
    meta = [{key: rec.get(key) for key in _META_FIELDS} for rec in records]

    # The directory is created at import time, but the working directory may
    # have changed (or the directory been removed) since then.
    OUT_META.parent.mkdir(parents=True, exist_ok=True)
    with open(OUT_META, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)
    print(f"✅ wrote {len(meta)} records -> {OUT_META}")


if __name__ == "__main__":
    main()