02_pull_non_shooting_articles.py
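"""Pull same-site, same-day news articles that do NOT mention a given shooting.

For each polarized-outlet article listed in shooting_frames.csv, this script builds a
Google query restricted to the article's site and publication date while excluding the
words in the row's 'name' field, then saves the HTML of the top result under
data/raw/no-shootings-control/ as a control document.
"""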
import datetime
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
# `search` is assumed to come from the googlesearch-python package, given the
# num_results keyword used below.
from googlesearch import search
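# Assumed third-party dependencies (inferred from the imports above, not pinned here):
#   pip install pandas requests beautifulsoup4 googlesearch-python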
def get_query(row):
    """Build a Google query for same-site, same-day articles that exclude the row's name."""
    day = datetime.datetime.strptime(row['date'], '%Y-%m-%d')
    day_plus_1 = day + datetime.timedelta(days=1)
    # Restrict results to the domain the original article was published on.
    site = row['url'].replace("http://", "").replace("https://", "").split('/')[0]
    query = "news "
    # Exclude every word of row['name'] so returned articles do not mention it.
    query += " ".join(['-"%s"' % word for word in row["name"].split()])
    # Limit results to the one-day window starting on the article's date.
    query += " site:%s after:%s before:%s" % (
        site, day.strftime('%Y-%m-%d'), day_plus_1.strftime('%Y-%m-%d'))
    return query
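# Example with a hypothetical row (name="Jane Doe", date="2018-05-18",
# url="https://www.example.com/news/story"), get_query would return roughly:
#   news -"Jane" -"Doe" site:www.example.com after:2018-05-18 before:2018-05-19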
def save_url(url, out_fn):
    """Fetch a URL and write its HTML, prefixed with the URL itself, to out_fn."""
    directory = os.path.dirname(out_fn)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    try:
        web = requests.get(url, timeout=10)
    except requests.RequestException:
        print('\trequest error')
        return
    if web.status_code != 200:
        print('\t' + str(web.status_code), 'error')
        return
    soup = BeautifulSoup(web.content, 'html.parser')
    with open(out_fn, 'w', encoding='utf-8') as outfile:
        outfile.write('\tURL:\t' + url + '\n')
        # If the page advertises its own size as "<N> bytes", skip anything over 1 MB.
        try:
            p = re.compile(r'^(\d+) bytes$')
            el = soup.find(text=p)
            size = int(p.match(el.string).group(1))  # cast to int so the comparison works
            if size > 1000000:
                outfile.write('too large')
                return
        except (AttributeError, TypeError, ValueError):
            pass
        outfile.write(str(soup))
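# Usage sketch (hypothetical URL and output path, following the TAG naming used in main):
#   save_url("https://www.example.com/some-story", "data/raw/no-shootings-control/42_1.html")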
def main():
    shootings = pd.read_csv('data/prepared/shootings/shooting_frames.csv')
    # Keep only rows whose outlet leaning is coded 0 or 2 (the polarized outlets).
    polarized = shootings[(shootings['leaning'] == 0) | (shootings['leaning'] == 2)].copy()
    # One tag per article, <id>_<page_num>, reused as the output filename.
    polarized['TAG'] = ['%s_%s' % (row['id'], row['page_num']) for _, row in polarized.iterrows()]
    for _, row in polarized.iterrows():
        out_fn = "data/raw/no-shootings-control/%s.html" % row['TAG']
        if os.path.exists(out_fn):
            print('already saved', out_fn)
            continue
        print(row['url'])
        print(row['name'])
        print(get_query(row))
        print()
        try:
            # Save the HTML of the top Google result for the control query.
            for result in search(get_query(row), num_results=1):
                try:
                    save_url(result, out_fn)
                except Exception as e:
                    print("error", e)
                time.sleep(1)
        except Exception as e:
            # Typically a search rate limit; back off before moving on.
            print("stopped for error [%s]. starting again in 20 minutes" % e)
            time.sleep(1200)


if __name__ == '__main__':
    main()