-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
91 lines (57 loc) · 2.05 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from bs4 import BeautifulSoup
from urllib.request import urlopen
def handleUnitPage(address):
    """Scrape one unit page and append its infobox values to the output file.

    Prints a separator line and the page heading to stdout, writes both to
    the module-level ``textFile``, then passes the text of every ``div``
    with the CSS class ``infobox-cell-2`` to ``writeResults`` together with
    a running 0-based index.

    Args:
        address: URL of the unit page. Surrounding whitespace (such as the
            newline left over when the URL is read as a line of a file) is
            stripped first; an address that is blank after stripping is
            skipped without fetching anything.
    """
    address = address.strip()
    if not address:
        # Blank line in the URL list: nothing to fetch.
        return

    # Parse inside the ``with`` so the HTTP response is closed as soon as
    # the document has been read.
    with urlopen(address) as page:
        soup = BeautifulSoup(page, 'html.parser')

    separator = "######################################################"
    print(separator)
    textFile.write(separator + "\n")

    #############
    # unit name
    # The heading text is taken from the second child node of the element
    # whose id is "firstHeading"; look it up once and reuse it.
    heading = soup.find(id="firstHeading").contents[1].get_text()
    print(heading)
    textFile.write(heading + "\n")

    ##########
    # line writer
    cells = soup.find_all("div", class_="infobox-cell-2")
    for count, cell in enumerate(cells):
        writeResults(cell.text, count)
def handleBuildingPage(address):
    """Scrape one building page and append its first infobox values.

    Prints a separator line and the page heading to stdout, writes both to
    the module-level ``textFile``, then passes the text of the first four
    ``div`` elements with the CSS class ``infobox-cell-2`` to
    ``writeResults`` together with a running 0-based index.

    Args:
        address: URL of the building page. Surrounding whitespace (such as
            the newline left over when the URL is read as a line of a file)
            is stripped first; an address that is blank after stripping is
            skipped without fetching anything.
    """
    address = address.strip()
    if not address:
        # Blank line in the URL list: nothing to fetch.
        return

    # Parse inside the ``with`` so the HTTP response is closed as soon as
    # the document has been read.
    with urlopen(address) as page:
        soup = BeautifulSoup(page, 'html.parser')

    separator = "######################################################"
    print(separator)
    textFile.write(separator + "\n")

    # The heading text is taken from the second child node of the element
    # whose id is "firstHeading"; look it up once and reuse it.
    heading = soup.find(id="firstHeading").contents[1].get_text()
    print(heading)
    textFile.write(heading + "\n")

    ##########
    # line writer
    # Only the first four matching cells are written (indices 0 through 3),
    # so let the search itself stop after four matches.
    cells = soup.find_all("div", class_="infobox-cell-2", limit=4)
    for count, cell in enumerate(cells):
        writeResults(cell.text, count)
def writeResults(txt, lineCounter):
    """Write one numbered line to the module-level ``textFile``.

    The line has the form ``"<lineCounter> <txt>"`` followed by a newline,
    where ``txt`` has had its leading and trailing whitespace removed.

    Args:
        txt: Text to record.
        lineCounter: Number placed at the start of the line.
    """
    cleaned = txt.strip()
    textFile.write(f"{lineCounter!s} {cleaned}\n")
# Driver: scrape every URL listed in unitUrls.txt and buildingUrls.txt
# (one URL per line) and collect the results in textFile.txt.
#
# ``textFile`` is bound at module level because handleUnitPage,
# handleBuildingPage and writeResults all write to it as a global.
# All three files are opened up front and managed by ``with`` so they are
# closed even if fetching or parsing a page raises. The output is written
# as UTF-8 so non-ASCII text scraped from a page cannot fail on platforms
# whose default encoding is narrower.
with open("textFile.txt", 'w', encoding="utf-8") as textFile, \
        open("unitUrls.txt", 'r') as unitUrlList, \
        open("buildingUrls.txt", 'r') as buildingUrlList:
    for urlAddress in unitUrlList:
        # Lines read from a file keep their trailing newline; strip it and
        # skip blank lines before handing the URL to the scraper.
        urlAddress = urlAddress.strip()
        if urlAddress:
            handleUnitPage(urlAddress)
    for urlAddress in buildingUrlList:
        urlAddress = urlAddress.strip()
        if urlAddress:
            handleBuildingPage(urlAddress)

# Report success only once the output file has been flushed and closed.
print()
print()
print("######################################################")
print("######################################################")
print(" SUCCESS")
print("######################################################")
print("######################################################")