scrape.py
"""Website to scrape: http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases

Data format: Excel
Fields: Disease name - Image link - Origin - See if you can identify the pest -
    Check what can legally come into Australia - Secure any suspect specimens
Output data:
- Submit the extracted Excel data
- Submit your code
Bonus points:
- Download the images programmatically and link them in the Excel sheet locally.
- Host the data back as a web page using the data from Excel.
"""
import os
import urllib.request

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
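
# Third-party dependencies (inferred from the imports; the original has no
# requirements file): requests, beautifulsoup4, lxml, and pandas, plus an
# Excel engine such as openpyxl, which pandas needs to write .xlsx files.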
"""Take url as argument and returns BeautifulSoup."""
def get_soup(url):
user_agent = {'user-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36'}
response = requests.get(url, headers=user_agent)
soup = bs(response.content, "lxml")
return soup
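
# A hedged aside, not part of the original code: requests.get() accepts a
# timeout in seconds, which keeps a long scrape from hanging on one slow page:
#     response = requests.get(url, headers=user_agent, timeout=30)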


class Write(object):
    """Writes the data to Excel and creates a web page for every disease."""

    def __init__(self, disease_name, local_images, origin, identify_the_pest,
                 legally_come_into_australia, suspect_specimens, image_links):
        self.disease_name = disease_name
        self.local_images = local_images
        self.origin = origin
        self.identify_the_pest = identify_the_pest
        self.legally_come_into_australia = legally_come_into_australia
        self.suspect_specimens = suspect_specimens
        self.image_links = image_links

    def to_excel(self):
        """Assemble the scraped lists into a DataFrame and write leancrop.xlsx."""
        df = pd.DataFrame()
        df['disease_name'] = self.disease_name
        df['origin'] = self.origin
        df['local_images'] = self.local_images
        df['image_links'] = self.image_links
        df['identify_the_pest'] = self.identify_the_pest
        df['legally_come_into_australia'] = self.legally_come_into_australia
        df['suspect_specimens'] = self.suspect_specimens
        # Use the writer as a context manager: ExcelWriter.save() was removed
        # in pandas 2.0, and the context manager saves on exit.
        with pd.ExcelWriter('leancrop.xlsx') as writer:
            df.to_excel(writer, sheet_name='Sheet1')

    def to_html(self):
        """Read leancrop.xlsx back and write one simple HTML page per disease."""
        df = pd.read_excel('leancrop.xlsx')
        for i in range(len(df.index)):
            # Name each page after its image file, e.g. .../my_pest.jpg -> my_pest.html.
            filename = df['image_links'][i].split('/')[-1].split('.jpg')[0] + '.html'
            # str() guards against NaN in cells that came back empty from Excel.
            html = """
<html>
<head></head>
<body>
<h1>""" + str(df['disease_name'][i]) + """</h1>
<img src='""" + str(df['local_images'][i]) + """'>
<br><br>
<strong>Origin:</strong> """ + str(df['origin'][i]) + """
<br><br>
<strong>See if you can identify the pest</strong>
<p>""" + str(df['identify_the_pest'][i]) + """</p>
<br><br>
<strong>Check what can legally come into Australia</strong>
<p>""" + str(df['legally_come_into_australia'][i]) + """</p>
<br><br>
<strong>Secure any suspect specimens</strong>
<p>""" + str(df['suspect_specimens'][i]) + """</p>
</body>
</html>"""
            with open(filename, 'w+') as f:
                f.write(html)


class Scrape(object):
    """Fetches every field from a disease page."""

    def __init__(self, soup):
        self.soup = soup

    def links(self):
        """Collect absolute links to the individual disease pages."""
        homepage_url = "http://www.agriculture.gov.au"
        atags = self.soup.find('ul', class_="flex-container").find_all('a')
        links = [homepage_url + atag['href'] for atag in atags if atag['href'].startswith('/')]
        return links

    def image(self):
        """Download the page's header image; return (local path, remote URL)."""
        homepage_url = "http://www.agriculture.gov.au"
        try:
            image_url = homepage_url + self.soup.find('div', class_="pest-header-image").find('img')['src']
            urllib.request.urlretrieve(image_url, image_url.split('/')[-1])
        except Exception:
            try:
                # Some pages keep the image in a plain content div instead.
                image_url = homepage_url + self.soup.find('div', id="content_div_2393636").find('img')['src']
                urllib.request.urlretrieve(image_url, image_url.split('/')[-1])
            except Exception:
                image_url = '/no image'
        return os.getcwd() + '/' + image_url.split('/')[-1], image_url

    def origin(self):
        """Return the text that follows the 'Origin' label in the page header."""
        try:
            origin = [strong.next_sibling for strong in self.soup.find('div', class_="pest-header-content").find_all('strong') if 'Origin' in strong.text]
            # Indexing inside the try also covers pages with no 'Origin' label,
            # which would otherwise raise IndexError on the empty list.
            return origin[0]
        except (AttributeError, IndexError):
            return ''

    def disease_name(self):
        """Return the disease name, falling back to the page <h1> if needed."""
        try:
            disease_name = self.soup.find('div', class_="pest-header-content").find('h2').text
        except AttributeError:
            try:
                disease_name = self.soup.find('div', class_="page-content full-width").find('h1').text
            except AttributeError:
                disease_name = 'no data'
        return disease_name

    def _section_text(self, index, default):
        """Return the paragraph text of the index-th collapsible section.

        The disease pages lay out 'identify the pest', 'what can legally come
        into Australia' and 'secure any suspect specimens' as consecutive
        <h3 class="trigger"> headings, each followed by a hidden div.
        """
        try:
            ptags = self.soup.find_all('h3', class_="trigger")[index].find_next('div', class_="hide").find_all('p')
            para = ''
            for p in ptags:
                para += p.text.strip().replace('\r\n', '')
            print(para)
            return para
        except Exception:
            return default

    def identify_the_pest(self):
        return self._section_text(0, 'no data')

    def legally_come_into_australia(self):
        return self._section_text(1, 'no data')

    def suspect_specimens(self):
        return self._section_text(2, '')


def run(url):
    """Scrape the index page, then every disease page, and write the outputs."""
    local_images = []
    origin = []
    identify_the_pest = []
    legally_come_into_australia = []
    suspect_specimens = []
    disease_name = []
    image_links = []
    # Fetch the given page, then collect the links to every disease page on it.
    soup = get_soup(url)
    scrape = Scrape(soup)
    links = scrape.links()
    # For each disease page, fetch its name, origin, image and text sections.
    for link in links:
        print(link)
        soup = get_soup(link)
        scrape = Scrape(soup)
        if scrape.disease_name() != 'no data':
            disease_name.append(scrape.disease_name())
            suspect_specimens.append(scrape.suspect_specimens())
            image = scrape.image()
            local_images.append(image[0])
            image_links.append(image[1])
            origin.append(scrape.origin())
            legally_come_into_australia.append(scrape.legally_come_into_australia())
            identify_the_pest.append(scrape.identify_the_pest())
    # Write the collected lists to Excel, then render the HTML pages.
    write = Write(disease_name, local_images, origin, identify_the_pest,
                  legally_come_into_australia, suspect_specimens, image_links)
    write.to_excel()
    write.to_html()


def main():
    """Driver function of the program."""
    url = "http://www.agriculture.gov.au/pests-diseases-weeds/plant#identify-pests-diseases"
    run(url)


if __name__ == '__main__':
    main()
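
# Usage sketch (describes what the code above produces when run):
#     python scrape.py
# It writes leancrop.xlsx, downloads each disease's header image into the
# current working directory, and emits one <image-name>.html page per disease.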