-
Notifications
You must be signed in to change notification settings - Fork 0
/
beeradvscrape.py
61 lines (54 loc) · 1.53 KB
/
beeradvscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
'''
(Gently) Scrape BeerAdvocate for their brewery names
from id tags
'''
import pandas as pd
import cPickle
#import MySQLdb as mdb
import pymysql as mdb
import time
from bs4 import BeautifulSoup
import urllib2
import sys
from authent import dbauth as auth
# Get the distinct brewery ids from the database.
sql='''
SELECT DISTINCT beer_brewerid
FROM RateBeer
'''
con=mdb.connect(**auth)
try:
    print('Loading brewery ids')
    # NOTE(review): read_frame is the legacy pandas SQL reader; newer pandas
    # releases expose pd.read_sql instead -- confirm against the installed version.
    df=pd.io.sql.read_frame(sql,con)
finally:
    # The connection is only needed for this single query; close it now so it
    # is not held open (idle) for the whole duration of the scrape below.
    con.close()
# One entry per distinct brewery id, in the order the query returned them.
brewerids=list(df['beer_brewerid'])
print('Found %i brewery ids'%len(brewerids))
# Pull brewery info from BeerAdvocate, one profile page per brewery id.
baseurl='http://www.beeradvocate.com/beer/profile/'
waittime=2.0 #seconds to pause after every request, so the scrape stays gentle
# Brewery names and locations, kept index-aligned with brewerids: exactly one
# entry is appended to each list per id ('' for both when the lookup fails).
breweries=[]
locations=[]
for count,bid in enumerate(brewerids):
    print('Finding id# %i'%bid)
    url=baseurl+str(bid)
    # Defaults recorded when the page cannot be fetched or parsed.
    brewername=''
    loc=''
    try:
        req=urllib2.Request(url,headers={'User-Agent':'Magic Browser'})
        response=urllib2.urlopen(req)
        try:
            html=response.read()
        finally:
            # Release the HTTP connection even if the read fails.
            response.close()
        soup=BeautifulSoup(html,'lxml')
        # Render the <title> element as text and drop the 7-character opening
        # "<title>" tag. The remainder is '|'-separated: the first field is
        # taken as the brewery name and the second as its location.
        # NOTE(review): the closing "</title>" is not stripped, so it stays
        # attached to the last '|' field -- confirm it never lands in `loc`.
        brewerinfo=str(soup.find('title'))[7:]
        fields=brewerinfo.split('|')
        # Tuple assignment: if the second field is missing, the IndexError is
        # raised before either name is rebound, so both stay ''.
        brewername,loc=fields[0],fields[1]
        print(brewername+' : '+loc)
    except Exception:
        # Best-effort scrape: log and move on. `except Exception` (rather than
        # a bare except) lets Ctrl-C / SystemExit still stop the script.
        e=sys.exc_info()[0]
        print('Error on id %i (count=%i): %s'%(bid,count,e))
    # Append exactly once per id, outside the try, so a failure part-way
    # through can never leave the lists misaligned with brewerids.
    breweries.append(brewername)
    locations.append(loc)
    time.sleep(waittime)
print('Done. Saving.')
# Save as a dictionary of three lists; entries at the same index are meant to
# describe the same brewery (id, scraped name, scraped location).
brewdict={'id':brewerids,'brewer':breweries,'location':locations}
# Pickle output is a byte stream, so open the file in binary mode; the
# with-block guarantees the file is flushed and closed before the script exits.
with open('breweries.cpk','wb') as outfile:
    cPickle.dump(brewdict,outfile)