# DermalCare / tools/build_meta_only.py
# pixel3user
# added images
# d924ea0
# tools/build_meta_only.py
import os, json
from pathlib import Path
# Relative paths probed for the source dataset, in priority order;
# find_dataset() returns the first one that exists.
DATA_CANDIDATES = [
"data/cosmetics_full_en_zh.jsonl",
"data/cosmetic_full_en_zh.jsonl",
"cosmetics_full_en_zh.jsonl",
"cosmetic_full_en_zh.jsonl",
]
# Output directory. NOTE: it is created as a side effect of importing this
# module; exist_ok=True makes repeated runs harmless.
OUT_DIR = Path("indexes")
OUT_DIR.mkdir(parents=True, exist_ok=True)
# Destination file for the metadata JSON written by main().
OUT_META = OUT_DIR / "cosmetics_meta.json"
def find_dataset(candidates=None):
    """Return the first candidate path that names an existing file.

    Args:
        candidates: Optional iterable of paths to probe, in priority order.
            When omitted, the module-level ``DATA_CANDIDATES`` list is used.

    Returns:
        The first entry of ``candidates`` that is an existing file, returned
        unchanged (same object that was supplied).

    Raises:
        FileNotFoundError: If no candidate is an existing file.
    """
    if candidates is None:
        candidates = DATA_CANDIDATES
    for path in candidates:
        # isfile() rather than exists(): a directory that happens to carry a
        # candidate name must be skipped, otherwise the caller's open() fails
        # later with a far less helpful error.
        if os.path.isfile(path):
            return path
    raise FileNotFoundError("Dataset JSONL not found in expected locations.")
def main():
    """Build the metadata-only JSON index from the cosmetics JSONL dataset.

    Locates the dataset via ``find_dataset()``, keeps a fixed subset of keys
    from each record (keys absent from a record are stored as ``None``), and
    writes the resulting list to ``OUT_META`` as indented, non-ASCII-escaped
    JSON. A one-line summary is printed when done.

    Raises:
        FileNotFoundError: Propagated from ``find_dataset()`` when no dataset
            file is found.
        json.JSONDecodeError: If a non-blank line of the dataset is not valid
            JSON.
    """
    # Keys copied into each metadata entry, in output order.
    fields = (
        "id",
        "brand_en",
        "brand_zh",
        "product_name_en",
        "product_name_zh",
        "category_en",
        "category_zh",
        "price_value",
        "price_currency",
        "source_url",
        "image_url",  # image link is part of the exported metadata
    )

    ds = find_dataset()
    meta = []
    # The context manager guarantees the input handle is closed; the dataset
    # is streamed line by line instead of being held as a second full list.
    with open(ds, "r", encoding="utf-8") as src:
        for line in src:
            # Blank or whitespace-only lines (e.g. trailing newlines) are not
            # records; json.loads would raise on them.
            if not line.strip():
                continue
            record = json.loads(line)
            meta.append({key: record.get(key) for key in fields})

    with open(OUT_META, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)
    print(f"✅ wrote {len(meta)} records -> {OUT_META}")


if __name__ == "__main__":
    main()