-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlm_crawler.py
110 lines (96 loc) · 3.37 KB
/
lm_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/python
# -*- coding: utf-8 -*-
import mechanize
import cookielib
from bs4 import BeautifulSoup
import re
#Browser
br = mechanize.Browser()
#Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
# Follows refresh 0 but not hangs on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# Want debugging messages?
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)
# User-Agent (this is cheating, ok?)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc25 Firefox/3.0.1')]
def get_urls(alpha):
basic_url = 'http://www.ldoceonline.com/browse/english/'
filename=alpha+".url"
idx_url = basic_url + alpha +'/'
r = br.open(idx_url)
html = r.read()
#print html
#print br.response().read()
soup = BeautifulSoup(html, "html5lib")
innerDiv = soup.find(attrs={'class': "page_content"})
links = innerDiv.find_all("a", href=re.compile("^http"))
for link in links:
if link.find("span"):
#print link
sublink = link['href']
r = br.open(sublink)
html = r.read()
soup = BeautifulSoup(html, "html5lib")
inner_ul = soup.find(attrs={'class': "browse_results"})
word_links = inner_ul.find_all("a", href=re.compile("^http"))
with open (filename,'a') as fd_url:
for link in word_links:
#text=link.find("span")
#print text.string.encode('utf-8').strip()
print>>fd_url, link['href']
def url_filtering(filename):
with open (filename,'r') as fd:
ln = filename.split('.')
fn_filed = ln[0]+'_filtered.url'
with open (fn_filed,'w') as fd_filed:
for ln in fd:
obj = re.match(r'(.*)[-](.*)', ln)
if not obj:
#without newline
print>>fd_filed, ln[:-1]
def get_vocabulary(filename):
with open(filename) as fd:
for ln in fd:
print ln
r = br.open(ln)
html = r.read()
soup = BeautifulSoup(html, "html5lib")
pagetitle=soup.find(attrs={'class':"pagetitle"})
dictlinks=soup.find_all(attrs={'class':"dictlink"})
for dictlink in dictlinks:
freq_hd = dictlink.find(attrs={'class':"frequent Head"})
if freq_hd:
word_level = freq_hd.find(attrs={'class': "tooltip LEVEL"})
word_freq = freq_hd.find_all(attrs={'class': "FREQ"})
#print word_level['title'], word_level.string
word_property=[pagetitle.string]
if word_level:
word_property.append(word_level['title'])
word_property.append(word_level.string)
if word_freq:
for v in word_freq:
word_property.append(v.string)
if len(word_property)>1:
#print '[%s]' % ', '.join(map(encode('utf-8'), word_property))
poses = freq_hd.find_all(attrs={'class':"POS"})
if poses:
for pos in poses:
word_property.append(pos.get_text())
with open("core_vocabulary.lm", 'a') as fd_cv:
print>>fd_cv, ' '.join(p.encode('utf-8').strip() for p in word_property)
if __name__ == '__main__':
alphabet=['r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
for alpha in alphabet:
get_urls(alpha)
url_filtering(alpha+'.url')
get_vocabulary(alpha+'_filtered.url')