-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_html.py
120 lines (106 loc) · 3.18 KB
/
parse_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# -*- coding: utf-8 -*-
import urllib.request, urllib.parse, urllib.error, urllib.request, urllib.error, urllib.parse, time
from bs4 import BeautifulSoup
def parse_wook(url):
    """Download the page at *url* and scrape book metadata from it.

    The page is parsed with BeautifulSoup (lxml backend). Title, author and
    synopsis are read from the ``div`` elements looked up by id below; the
    remaining fields come from ``get_data`` applied to the first ``div``
    with class ``data``.

    Args:
        url: Address handed to ``urllib.request.Request``.

    Returns:
        A tuple ``(True, book_data)``. The first element is always ``True``,
        including when the download fails; in that case ``book_data`` holds
        only ``'pu_title'`` (a fixed error marker) and ``'pu_sinopse'``
        (the exception text).
    """
    book_data = {}
    try:
        req = urllib.request.Request(url)
        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(req) as response:
            the_page = response.read()
    except Exception as detail:
        # Best-effort boundary: the failure is reported in-band via the dict.
        # NOTE(review): the status flag is True here as well -- confirm that
        # callers really do not need to distinguish failure from success.
        print("Error in parseWook ", detail)
        book_data['pu_title'] = 'Error in parseWok'
        book_data['pu_sinopse'] = str(detail)
        return True, book_data

    soup = BeautifulSoup(the_page, "lxml")

    title_div = soup.find(
        'div', {'id': 'productPageSectionDetails-collapseDetalhes-content-title'})
    book_data['pu_title'] = get_data_wook(title_div)

    author_div = soup.find(
        'div', {'id': 'productPageSectionDetails-collapseDetalhes-content-author'})
    book_data['pu_author'] = get_autor(author_div)

    synopsis_div = soup.find('div', {'id': 'productPageSectionAboutBook-sinopse'})
    synopsis = str(get_data_wook(synopsis_div, 'cr"', '</di'))
    tag = 'itemprop="description"'
    marker = synopsis.find(tag)
    # Cut at the description marker BEFORE stripping markup: the offset is
    # measured on the raw text, so stripping first would shift it and chop
    # the wrong characters. If the marker is absent the whole text is kept.
    if marker != -1:
        synopsis = synopsis[marker + len(tag):]
    # Same fragments, in the same order, as the original replace() chain.
    for fragment in ('<p>', '</p>', '<i>', '</i>', '<br>', '\n', 'br/', '<>', '>'):
        synopsis = synopsis.replace(fragment, '')
    book_data['pu_sinopse'] = synopsis

    cdata = soup.find('div', {'class': 'data'})
    book_data.update(get_data(cdata))
    return True, book_data
def get_data_wook(elemento, rightD='">', leftD='</d'):
    """Return the text found between two delimiters in an element's markup.

    Args:
        elemento: Any object; it is converted with ``str()`` before scanning.
            ``None`` (e.g. a failed lookup) yields an empty string.
        rightD: Delimiter that ends the prefix to skip; the result starts
            right after its first occurrence.
        leftD: Delimiter that ends the result; its first occurrence after
            ``rightD`` is used.

    Returns:
        The substring between the delimiters, or ``''`` when the element is
        ``None`` or either delimiter cannot be found.
    """
    if elemento is None:
        return ''
    text = str(elemento)
    start = text.find(rightD)
    if start == -1:
        # Without this guard the -1 would be used as a slice index and
        # return an arbitrary fragment of the markup.
        return ''
    start += len(rightD)
    # Search for the closing delimiter only after the opening one, so an
    # earlier stray occurrence cannot produce an empty result.
    end = text.find(leftD, start)
    if end == -1:
        return ''
    return text[start:end]
def get_autor(a):
    """Return the text of the last link (``<a ...">text</a>``) in *a*.

    Args:
        a: Any object; it is converted with ``str()`` before scanning.

    Returns:
        The text between the last ``">`` that precedes the last ``</a>``
        and that ``</a>``. Returns ``''`` when *a* is ``None`` or the
        markup contains no such link.
    """
    if a is None:
        return ''
    text = str(a)
    end = text.rfind('</a>')
    if end == -1:
        return ''
    # Only look before the closing tag, so a later attribute-bearing
    # element cannot push the start past the end.
    start = text.rfind('">', 0, end)
    if start == -1:
        return ''
    return text[start + 2:end]
def get_desc(a):
    """Return the description text of *a* with simple inline markup removed.

    The text is taken between the last ``">`` that precedes the last
    ``</p`` and that ``</p``. ``<br />``, ``<b>``, ``</b>``, ``<p>`` and any
    leftover ``>`` are then stripped.

    Args:
        a: Any object; it is converted with ``str()`` before scanning.

    Returns:
        The cleaned description as ``str``, or ``''`` when *a* is ``None``
        or either delimiter is missing.
    """
    if a is None:
        return ''
    # Stay in str: encoding to bytes here made the str-based rfind/replace
    # calls below raise TypeError on Python 3.
    text = str(a)
    end = text.rfind('</p')
    if end == -1:
        return ''
    start = text.rfind('">', 0, end)
    if start == -1:
        return ''
    # Skip the two-character delimiter itself so no stray quote is left
    # at the front of the result.
    body = text[start + 2:end]
    for fragment in ('<br />', '</b>', '<b>', '<p>', '>'):
        body = body.replace(fragment, '')
    return body
def get_img(a):
    """Return the ``src`` URL of *a*, cut before its last ``&``.

    Args:
        a: Any object; it is converted with ``str()`` before scanning.

    Returns:
        The value of the first `` src="..."`` attribute, truncated at the
        last ``&`` inside that value when there is one. Returns ``''`` when
        *a* is ``None`` or has no ``src`` attribute.
    """
    if a is None:
        return ''
    text = str(a)
    marker = ' src="'
    start = text.find(marker)
    if start == -1:
        return ''
    start += len(marker)
    closing_quote = text.find('"', start)
    if closing_quote == -1:
        closing_quote = len(text)
    url = text[start:closing_quote]
    # Look for '&' only inside the attribute value: scanning the whole
    # element returned trailing markup, or chopped the last character
    # when the URL had no '&' at all.
    amp = url.rfind('&')
    if amp == -1:
        return url
    return url[:amp]
def get_data(a):
    """Extract the publication details from the markup of *a*.

    The markup is scanned left to right; each field is the text between its
    marker and the next ``<``. A marker that is not found yields ``''`` for
    that field and leaves the scan position unchanged, so one missing field
    no longer corrupts the ones after it.

    Args:
        a: Any object; it is converted with ``str()`` before scanning.

    Returns:
        A dict with the keys ``'isbn'``, ``'pu_ed_date'``, ``'pu_editor'``,
        ``'size'`` (whitespace and ``mm`` removed) and ``'pu_pages'``.
    """
    text = str(a)
    # (result key, marker) in the order the scan expects them. A key of
    # None means the value is skipped over but not stored.
    fields = (
        ('isbn', 'itemprop="isbn">'),
        ('pu_ed_date', 'itemprop="datePublished">'),
        ('pu_editor', 'Editor: <span class="info" itemprop="name">'),
        ('size', 'Dimensões: <span class="info">'),
        # NOTE(review): the binding value used to be written to 'pu_pages'
        # and was immediately overwritten by the page count, so it never
        # reached the caller. It is still scanned to keep the cursor moving
        # exactly as before -- confirm whether it deserves its own key.
        (None, 'Encadernação: <span class="info">'),
        ('pu_pages', 'itemprop="numberOfPages">'),
    )
    hl = {}
    cursor = 0
    for key, tag in fields:
        value, cursor = _field_after(text, tag, cursor)
        if key is None:
            continue
        if key == 'size':
            value = value.strip().replace('mm', '').replace(' ', '')
        hl[key] = value
    return hl


def _field_after(text, tag, start):
    """Return ``(value, cursor)`` for the text between *tag* and the next ``<``.

    The search for *tag* begins at index *start*. When *tag* is absent the
    result is ``('', start)``.
    """
    pos = text.find(tag, start)
    if pos == -1:
        return '', start
    begin = pos + len(tag)
    end = text.find('<', begin)
    if end == -1:
        end = len(text)
    return text[begin:end], end
def get_ano(a):
    """Return *a* converted to its string form via ``str()``."""
    as_text = str(a)
    return as_text
# Script entry point: running this module directly performs no action;
# the functions above are meant to be imported and called.
if __name__ == '__main__':
    pass