-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
52 lines (35 loc) · 1.07 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import re
import requests
from bs4 import BeautifulSoup
# Base search-results URL; the 1-based page number is appended per request.
URL = "https://www.epicurious.com/search?content=recipe&sort=newest&page="
# Total number of result pages at the time the scraper was written —
# NOTE(review): presumably observed manually; confirm it is still current.
MAX = 2507
def scrape_urls():
    """Crawl every paginated search page and collect recipe URL paths.

    Fetches pages 1..MAX of the Epicurious search results, extracts all
    ``/recipes/food/views/...`` paths from the raw HTML with a regex, and
    writes the de-duplicated set to ``outputs/urls``, one path per line.
    Prints a progress line per page.
    """
    # Hoist the pattern out of the loop and reuse one TCP connection for
    # all ~2500 requests instead of opening a fresh one per page.
    pattern = re.compile(r'/recipes/food/views/[^"]+')
    session = requests.Session()
    urls = set()
    for page in range(1, MAX + 1):
        resp = session.get(URL + str(page))
        urls |= set(pattern.findall(resp.text))
        print(f"page {page}")
    with open("outputs/urls", "w") as f:
        # Batch the writes; output is unordered because `urls` is a set.
        f.writelines(u + "\n" for u in urls)
def main():
    """Scrape recipe URLs, then harvest each recipe's JSON-LD metadata.

    Re-reads ``outputs/urls`` (written by :func:`scrape_urls`), fetches
    every recipe page, and appends the page's ``application/ld+json``
    script contents to ``data/epi.json`` (one JSON document per line).
    URLs whose page has no such script tag are recorded in
    ``outputs/bad-recs`` instead.
    """
    scrape_urls()
    with open("outputs/urls") as f:
        urls = f.readlines()
    # Context managers guarantee both output files are closed even if a
    # request raises mid-loop (the originals leaked on any exception).
    with open("data/epi.json", "w") as out, open("outputs/bad-recs", "w") as err:
        for i, raw in enumerate(urls):
            # rstrip is safer than the original url[:-1], which chopped a
            # real character when the final line had no trailing newline.
            url = raw.rstrip("\n")
            print(f"{i}: {url}")
            r = requests.get("https://www.epicurious.com" + url)
            soup = BeautifulSoup(r.text, "html.parser")
            tag = soup.find("script", {"type": "application/ld+json"})
            if tag is None:
                # Explicit missing-tag check replaces the original bare
                # `except:`, which silently swallowed every exception.
                err.write(url + "\n")
                print("No json")
                continue
            out.write(tag.getText() + "\n")
# Script entry point: crawl search pages, then extract recipe JSON-LD.
if __name__ == "__main__":
    main()