-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcrawl_yelp.py
36 lines (32 loc) · 886 Bytes
/
crawl_yelp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import requests
from bs4 import BeautifulSoup
# Yelp restaurant-search endpoint; ``{}`` is filled in with the city name.
base_url = "https://www.yelp.com/search?find_desc=Restaurants&find_loc={}"
city = "los angeles"

OUTPUT_PATH = 'yelp_20_pages.txt'
PAGE_SIZE = 30        # step between consecutive ``start`` offsets
STOP_OFFSET = 510     # exclusive upper bound: offsets 0, 30, ..., 480
REQUEST_TIMEOUT = 10  # seconds; keeps a stalled connection from hanging the crawl


def build_page_url(city_name, start):
    """Return the search URL for *city_name* beginning at result offset *start*.

    The URL is rebuilt from ``base_url`` on every call, so it always carries
    exactly one ``start`` parameter regardless of how many pages were
    requested before.
    """
    return f"{base_url.format(city_name)}&start={start}"


def crawl(city_name=city, output_path=OUTPUT_PATH):
    """Crawl the search result pages and record every distinct https link.

    Each offset in ``range(0, STOP_OFFSET, PAGE_SIZE)`` is fetched once. For
    every page answered with HTTP 200, each ``<a>`` whose ``href`` contains
    ``https:`` and has not been seen before is printed and written to
    *output_path*, followed by a blank line.

    Args:
        city_name: City substituted into ``base_url``.
        output_path: File that receives the collected links (overwritten).

    Returns:
        The number of distinct links written.
    """
    seen = set()  # set membership keeps de-duplication O(1) per link
    count = 0
    with open(output_path, 'w', encoding='utf-8') as file:
        for start in range(0, STOP_OFFSET, PAGE_SIZE):
            page_url = build_page_url(city_name, start)
            try:
                response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                # One unreachable page should not abort the whole crawl.
                print(f"REQUEST FAILED FOR {page_url}: {exc}")
                continue
            print(f"STATUS CODE: {response.status_code} FOR {response.url}")
            if response.status_code != 200:
                # Decide once per page instead of once per anchor, and report
                # the status that was actually returned.
                print(f"{response.status_code} -- SKIPPING")
                continue
            soup = BeautifulSoup(response.text, 'html.parser')
            for link in soup.find_all('a'):
                href = link.get('href', '')
                if 'https:' not in href:
                    continue
                if href in seen:
                    print('ALREADY EXISTS...')
                    continue
                seen.add(href)
                print(href)
                file.write(href + '\n\n')
                count += 1
    print(f"{count} REQUESTED...")
    return count


if __name__ == "__main__":
    crawl()