-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIngredientsDictBuilder.py
125 lines (95 loc) · 4.59 KB
/
IngredientsDictBuilder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import os.path
from os import path
import json
import urllib.request
from bs4 import BeautifulSoup
from RecipeInfo import RecipeInfo
from helpers import lst_of_urls
from collections import OrderedDict
def loadIngredientsDict():
if path.exists("IngredientsDict.json"):
return json.load(open("IngredientsDict.json", "r"))
else:
return {}
def saveIngredientsDict():
out_file = open("IngredientsDict.json", "w")
json.dump(ID, out_file)
ID = loadIngredientsDict()
def addIngredient(name, properties=[]):
if name not in ID:
ID[name] = {
'tally': 1
}
else:
ID[name]['tally'] += 1
for property in properties:
if property not in ID[name]:
ID[name][property] = 1
else:
ID[name][property] += 1
saveIngredientsDict()
def analyzeIngredientsInRecipes(urls_list, properties=[]):
for i, url in enumerate(urls_list):
print(f"Analyzed ingredients in {i+1}/{len(urls_list)} urls.", end='\r')
rcp = RecipeInfo(url)
for ing in rcp.Ingredients:
addIngredient(ing.name, properties)
print("\nFinished analyzing urls.")
def getListOfRecipeUrls(main_url, numPages, startingPage=1):
startingPage = startingPage - 1
recipe_urls = []
for page in range(numPages):
print(f"Fetched recipe urls from {page+1}/{numPages} pages.", end='\r')
try:
fetch_response = urllib.request.urlopen(main_url + f"?page={2 + startingPage + page}")
html_doc = (fetch_response.read()).decode("utf-8")
parser = BeautifulSoup(html_doc, 'html.parser')
for imageLink in parser.find_all("a", {"class":"tout__imageLink"}):
if "/recipe/" in imageLink.attrs['href']:
recipe_urls.append("https://www.allrecipes.com" + imageLink.attrs['href'])
except:
print("\nInvalid url. Continuing...")
continue
print("\nFinished fetching recipe urls.")
print(f"Ending page: {startingPage + numPages}")
return recipe_urls
def filterDictBy(a_dict, included_properties, excluded_properties=[]):
main_set = set(['tally'] + included_properties)
return { ing:val for ing,val in OrderedDict(a_dict).items() if main_set.issubset(set(a_dict[ing].keys())) and (exc not in a_dict[ing].keys() for exc in excluded_properties) }
def sortDictBy(sub_dict, property):
return OrderedDict(sorted(sub_dict.items(), key=lambda x: (x[1][property]**1.3)/(x[1]['tally']), reverse=True))
def filterAndSortDictBy(a_dict, property):
return sortDictBy(filterDictBy(a_dict, [property]), property)
print('wow')
# Filter/Sort examples
# filterAndSortDictBy(ID, 'meat')
# filterDictBy(filterAndSortDictBy(ID, 'healthy'), ['mexican'])
# filterDictBy(filterAndSortDictBy(ID, 'healthy'), ['healthy'], ['meat'])
# import time
# start = time.time()
# To continue getting more recipes from a certain category, change the starting page to the end page so far + 1
# I.e., if a category has scanned 8 pages of reipes so far and you want to scan 3 more, the new starting page would
# be 9. Make sure to update the new ending page to be the new starting page + number of pages scanned.
# # End page so far: 8
# asian_rcps = getListOfRecipeUrls("https://www.allrecipes.com/recipes/227/world-cuisine/asian/", 8, 1)
# analyzeIngredientsInRecipes(asian_rcps, ['asian'])
# # End page so far: 8
# italian_rcps = getListOfRecipeUrls("https://www.allrecipes.com/recipes/723/world-cuisine/european/italian/", 8, 1)
# analyzeIngredientsInRecipes(italian_rcps, ['italian'])
# # End page so far: 8
# veg_rcps = getListOfRecipeUrls("https://www.allrecipes.com/recipes/87/everyday-cooking/vegetarian/", 8, 1)
# analyzeIngredientsInRecipes(veg_rcps, ['vegetarian'])
# # End page so far: 8
# seafood_rcps = getListOfRecipeUrls("https://www.allrecipes.com/recipes/93/seafood/", 8, 1)
# analyzeIngredientsInRecipes(seafood_rcps, ['seafood'])
# # End page so far: 8
# meat_rcps = getListOfRecipeUrls("https://www.allrecipes.com/recipes/92/meat-and-poultry/", 8, 1)
# analyzeIngredientsInRecipes(meat_rcps, ['meat'])
# # End page so far: 8
# healthy_rcps = getListOfRecipeUrls("https://www.allrecipes.com/recipes/84/healthy-recipes/", 8, 1)
# analyzeIngredientsInRecipes(healthy_rcps, ['healthy'])
# # End page so far: 8
# mexican_rcps = getListOfRecipeUrls("https://www.allrecipes.com/recipes/728/world-cuisine/latin-american/mexican/", 8, 1)
# analyzeIngredientsInRecipes(mexican_rcps, ['mexican'])
# print(f"Total extraction took {time.time() - start} seconds.")
# print(f"Each category takes about {(time.time() - start)/7} seconds on average.")