import argparse
import json
import os
import pickle
import re
from collections import Counter

import nltk
import numpy as np
from tqdm import tqdm

def get_ingredient(det_ingr, replace_dict):
    """Normalize a detected ingredient into a lowercase, underscore-joined token."""
    det_ingr_undrs = det_ingr['text'].lower()
    # drop quantities: remove every digit
    det_ingr_undrs = ''.join(i for i in det_ingr_undrs if not i.isdigit())
    # apply character replacements (e.g. '&' -> 'and', punctuation -> '')
    for rep, char_list in replace_dict.items():
        for c_ in char_list:
            if c_ in det_ingr_undrs:
                det_ingr_undrs = det_ingr_undrs.replace(c_, rep)
    det_ingr_undrs = det_ingr_undrs.strip()
    det_ingr_undrs = det_ingr_undrs.replace(' ', '_')
    return det_ingr_undrs
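
# For instance (hypothetical input), with replace_dict_ingrs as defined below in
# build_vocab_recipe1m:
#   get_ingredient({'text': '2 Red Peppers, chopped'}, replace_dict_ingrs)
# lowercases, drops the digit and the comma, and returns 'red_peppers_chopped'.
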
def remove_plurals(counter_ingrs, ingr_clusters):
    """Fold plural ingredient keys ('-es'/'-s') into their singular forms."""
    del_ingrs = []
    for k, v in counter_ingrs.items():
        # drop empty keys outright
        if len(k) == 0:
            del_ingrs.append(k)
            continue
        gotit = 0
        if k[-2:] == 'es':
            if k[:-2] in counter_ingrs.keys():
                counter_ingrs[k[:-2]] += v
                ingr_clusters[k[:-2]].extend(ingr_clusters[k])
                del_ingrs.append(k)
                gotit = 1
        if k[-1] == 's' and gotit == 0:
            if k[:-1] in counter_ingrs.keys():
                counter_ingrs[k[:-1]] += v
                ingr_clusters[k[:-1]].extend(ingr_clusters[k])
                del_ingrs.append(k)
    # delete merged plural keys only after iteration, to avoid mutating the
    # dicts while looping over them
    for item in del_ingrs:
        del counter_ingrs[item]
        del ingr_clusters[item]
    return counter_ingrs, ingr_clusters
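
# For instance (hypothetical counts): given counter_ingrs = {'tomato': 5, 'tomatoes': 2}
# with matching clusters, remove_plurals folds 'tomatoes' into 'tomato', yielding a
# count of 7 and a 'tomato' cluster extended with the plural's variants.
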
def cluster_ingredients(counter_ingrs):
    """Group ingredient variants under a shared base word and sum their counts."""
    mydict = dict()
    mydict_ingrs = dict()

    for k, v in counter_ingrs.items():
        # candidate base words: last word, first word and, for multi-word
        # keys, the first and last bigrams
        w1 = k.split('_')[-1]
        w2 = k.split('_')[0]
        lw = [w1, w2]
        if len(k.split('_')) > 1:
            w3 = k.split('_')[0] + '_' + k.split('_')[1]
            w4 = k.split('_')[-2] + '_' + k.split('_')[-1]
            lw = [w1, w2, w4, w3]

        gotit = 0
        for w in lw:
            if w in counter_ingrs.keys():
                # prefer a single-word base if one of w's parts is itself a key
                parts = w.split('_')
                if parts[0] in counter_ingrs.keys():
                    w = parts[0]
                elif len(parts) > 1 and parts[1] in counter_ingrs.keys():
                    w = parts[1]
                if w in mydict.keys():
                    mydict[w] += v
                    mydict_ingrs[w].append(k)
                else:
                    mydict[w] = v
                    mydict_ingrs[w] = [k]
                gotit = 1
                break
        if gotit == 0:
            # no base word found: the key becomes its own cluster
            mydict[k] = v
            mydict_ingrs[k] = [k]
    return mydict, mydict_ingrs
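
# For instance (hypothetical counts): 'cherry_tomato' shares its last word with
# the existing key 'tomato', so it is folded into that cluster:
#   cluster_ingredients({'tomato': 3, 'cherry_tomato': 2})
#   -> ({'tomato': 5}, {'tomato': ['tomato', 'cherry_tomato']})
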
def update_counter(list_, counter_toks, istrain=False):
    """Tokenize each sentence and add the tokens to counter_toks (train split only)."""
    for sentence in list_:
        tokens = nltk.tokenize.word_tokenize(sentence)
        if istrain:
            counter_toks.update(tokens)
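
# For instance (requires the nltk 'punkt' tokenizer data to be downloaded):
#   c = Counter()
#   update_counter(['Mix the flour.'], c, istrain=True)
# leaves c == Counter({'Mix': 1, 'the': 1, 'flour': 1, '.': 1}); with
# istrain=False the sentences are tokenized but never counted.
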
def build_vocab_recipe1m(args):
    print("Loading data...")
    with open(os.path.join(args.recipe1m_path, 'det_ingrs.json'), 'r') as f:
        dets = json.load(f)

    # character replacements for ingredient names and instruction text
    replace_dict_ingrs = {'and': ['&', "'n"], '': ['%', ',', '.', '#', '[', ']', '!', '?']}
    replace_dict_instrs = {'and': ['&', "'n"], '': ['#', '[', ']']}

    # map recipe id -> index into dets
    idx2ind = {}
    for i, entry in enumerate(dets):
        idx2ind[entry['id']] = i

    # cache files for the raw ingredient / word counters
    ingrs_file = os.path.join(args.save_path, 'allingrs_count.pkl')
    instrs_file = os.path.join(args.save_path, 'allwords_count.pkl')
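
    #####
    # Counting pass -- a hedged, minimal reconstruction. The full pass (which
    # also cleans instruction text with replace_dict_instrs) is not part of
    # this excerpt. It assumes the standard Recipe1M layout: det_ingrs.json
    # entries carry parallel 'ingredients'/'valid' lists, and layer1.json
    # entries carry 'instructions' ([{'text': ...}, ...]) and a 'partition'.
    #####
    if os.path.exists(ingrs_file) and os.path.exists(instrs_file) and not args.forcegen:
        # reuse cached counters unless --forcegen is given
        with open(ingrs_file, 'rb') as f:
            counter_ingrs = pickle.load(f)
        with open(instrs_file, 'rb') as f:
            counter_toks = pickle.load(f)
    else:
        counter_ingrs = Counter()
        counter_toks = Counter()
        with open(os.path.join(args.recipe1m_path, 'layer1.json'), 'r') as f:
            layer1 = json.load(f)
        for entry in tqdm(layer1):
            # assumption: the vocabulary is built from the training split only
            if entry['partition'] != 'train' or entry['id'] not in idx2ind:
                continue
            det_entry = dets[idx2ind[entry['id']]]
            # count valid, normalized ingredient names
            for det_ingr, valid in zip(det_entry['ingredients'], det_entry['valid']):
                if valid:
                    ingr_name = get_ingredient(det_ingr, replace_dict_ingrs)
                    if len(ingr_name) > 0:
                        counter_ingrs.update([ingr_name])
            # count instruction tokens
            instrs = [instr['text'] for instr in entry['instructions']]
            update_counter(instrs, counter_toks, istrain=True)
        with open(ingrs_file, 'wb') as f:
            pickle.dump(counter_ingrs, f)
        with open(instrs_file, 'wb') as f:
            pickle.dump(counter_toks, f)
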
    # manually add missing entries for better clustering
    # (duplicates in the original list removed; 'chucken_thighs' typo fixed)
    base_words = ['peppers', 'tomato', 'spinach_leaves', 'turkey_breast', 'lettuce_leaf',
                  'chicken_thighs', 'milk_powder', 'bread_crumbs', 'onion_flakes',
                  'red_pepper', 'pepper_flakes', 'juice_concentrate', 'cracker_crumbs',
                  'hot_chili', 'seasoning_mix', 'dill_weed', 'pepper_sauce', 'sprouts',
                  'cooking_spray', 'cheese_blend', 'basil_leaves', 'pineapple_chunks',
                  'marshmallow', 'chile_powder', 'corn_kernels', 'tomato_sauce', 'chickens',
                  'cracker_crust', 'lemonade_concentrate', 'red_chili', 'mushroom_caps',
                  'mushroom_cap', 'breaded_chicken', 'frozen_pineapple', 'seaweed',
                  'bouillon_granules', 'stuffing_mix', 'parsley_flakes', 'chicken_breast',
                  'baguettes', 'green_tea', 'peanut_butter', 'green_onion', 'fresh_cilantro',
                  'hot_pepper', 'dried_lavender', 'white_chocolate', 'cake_mix',
                  'cheese_spread', 'mandarin_orange', 'laurel', 'cabbage_head', 'pistachio',
                  'cheese_dip', 'thyme_leave', 'boneless_pork', 'onion_dip',
                  'skinless_chicken', 'dark_chocolate', 'canned_corn', 'muffin',
                  'frozen_broccoli', 'philadelphia']

    for base_word in base_words:
        if base_word not in counter_ingrs.keys():
            counter_ingrs[base_word] = 1
    counter_ingrs, cluster_ingrs = cluster_ingredients(counter_ingrs)
    counter_ingrs, cluster_ingrs = remove_plurals(counter_ingrs, cluster_ingrs)

    # discard words / ingredients whose frequency falls below the thresholds
    words = [word for word, cnt in counter_toks.items() if cnt >= args.threshold_words]
    ingrs = {word: cnt for word, cnt in counter_ingrs.items() if cnt >= args.threshold_ingrs}
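
    # Vocabulary assembly -- a hedged sketch: the original script wraps these
    # into Vocabulary objects and builds per-split datasets, none of which is
    # part of this excerpt. A minimal stand-in that satisfies main()'s
    # (vocab_ingrs, vocab_toks, dataset) interface:
    vocab_toks = {word: idx for idx, word in enumerate(sorted(words))}
    vocab_ingrs = {}
    for idx, ingr in enumerate(ingrs.keys()):
        # every clustered variant maps to the same index as its base word
        for variant in cluster_ingrs[ingr]:
            vocab_ingrs[variant] = idx
    # split construction (using args.maxnuminstrs, args.minnumingrs, etc.) is
    # omitted from this excerpt; return empty splits as placeholders
    dataset = {'train': [], 'val': [], 'test': []}
    return vocab_ingrs, vocab_toks, dataset
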
def main(args):
    vocab_ingrs, vocab_toks, dataset = build_vocab_recipe1m(args)

    with open(os.path.join(args.save_path, args.suff + 'recipe1m_vocab_ingrs.pkl'), 'wb') as f:
        pickle.dump(vocab_ingrs, f)
    with open(os.path.join(args.save_path, args.suff + 'recipe1m_vocab_toks.pkl'), 'wb') as f:
        pickle.dump(vocab_toks, f)

    for split in dataset.keys():
        with open(os.path.join(args.save_path, args.suff + 'recipe1m_' + split + '.pkl'), 'wb') as f:
            pickle.dump(dataset[split], f)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--recipe1m_path', type=str,
                        default='path/to/recipe1m',
                        help='recipe1m path')
    parser.add_argument('--save_path', type=str, default='../data/',
                        help='path for saving vocabulary wrapper')
    parser.add_argument('--suff', type=str, default='')
    parser.add_argument('--threshold_ingrs', type=int, default=10,
                        help='minimum ingr count threshold')
    parser.add_argument('--threshold_words', type=int, default=10,
                        help='minimum word count threshold')
    parser.add_argument('--maxnuminstrs', type=int, default=20,
                        help='max number of instructions (sentences)')
    parser.add_argument('--maxnumingrs', type=int, default=20,
                        help='max number of ingredients')
    parser.add_argument('--minnuminstrs', type=int, default=2,
                        help='min number of instructions (sentences)')
    parser.add_argument('--minnumingrs', type=int, default=2,
                        help='min number of ingredients')
    parser.add_argument('--minnumwords', type=int, default=20,
                        help='minimum total number of words across instructions')
    parser.add_argument('--forcegen', dest='forcegen', action='store_true',
                        help='recompute cached word/ingredient counters')
    parser.set_defaults(forcegen=False)
    args = parser.parse_args()
    main(args)
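
# Example invocation (hypothetical paths):
#   python build_vocab.py --recipe1m_path /data/recipe1m --save_path ../data/
# writes recipe1m_vocab_ingrs.pkl, recipe1m_vocab_toks.pkl and one
# recipe1m_<split>.pkl per split under --save_path.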