-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparser.py
53 lines (48 loc) · 1.44 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from traceback import print_exc as traceback_print_exc
from bs4 import BeautifulSoup
from bs4 import element as bs4_element
from .dish import dish
from .menu import menu
from datetime import date
import re
# Matches "<day>.<month>" with an optional trailing dot, optionally preceded
# by any run of non-digit characters. Group 1 is the day, group 2 the month.
# Written as a raw string so the regex escapes are not read as (invalid)
# string escapes.
# NOTE: both groups use `*`, so either one may be captured as an empty string.
date_rgx = re.compile(r"\D*(\d*)\.(\d*)\.?")
def _cell_text(td):
    """Return the cleaned text of one ``<td>`` cell.

    ``<sup>`` and ``<br>`` elements are removed from the cell before its text
    is read. The text is stripped and any remaining CRLF sequences are
    removed. A cell containing an image whose ``alt`` attribute is ``"vegan"``
    yields the ASCII marker ``"V+"`` instead of its text.
    """
    for sup in td.find_all("sup"):
        sup.extract()
    for br in td.find_all("br"):
        br.extract()
    text = td.text.strip().replace('\r\n', '')
    # convert the 'V+' icon to ascii
    img = td.img
    # .get() instead of [...]: an <img> without an alt attribute must not
    # raise KeyError and thereby discard the whole row.
    if img and img.get('alt') == "vegan":
        text = "V+"
    return text


def parse(data):
    """Parse the HTML bytes of a menu page into a ``menu`` object.

    ``data`` is decoded as Latin-1 and parsed with BeautifulSoup. Every
    ``<tr>`` of the first table in the document body is turned into a list of
    cleaned cell texts; rows whose cells are all empty are dropped. The first
    remaining row is discarded as a heading row. For each following row the
    second cell is removed, the first cell is checked for a ``day.month`` date
    (which then applies to this and all later rows until the next date), and
    the remaining cells are passed to ``dish`` together with the current date.

    Returns:
        The ``menu`` instance, filled via ``append(date, dish)``. It is empty
        when the table holds no non-empty rows.

    Raises:
        IndexError: if a data row has fewer than two cells.
        ValueError: if a matched day/month pair is not a valid calendar date.
    """
    # NOTE(review): no parser is named here, so bs4 picks whichever one is
    # installed; left unchanged because the choice affects how the document
    # tree (including ``soup.body``) is built.
    soup = BeautifulSoup(data.decode(encoding="latin_1", errors="ignore"))

    # extract the relevant table from HTML
    table = []
    for tr in soup.body.table.select("tr"):
        try:
            row = [_cell_text(td) for td in tr.select("td")]
        except Exception:
            # Best effort: report the broken row and carry on with the rest.
            # (Not a bare ``except`` so Ctrl-C / SystemExit still propagate.)
            traceback_print_exc()
            continue
        if any(row):  # prevent empty rows
            table.append(row)

    # walk the table, create dish objects and add them to the menu
    themenu = menu()
    cur_date = None
    # The source HTML has no table header, so the heading row is identified
    # purely by position: it is the first non-empty row. Slicing (rather than
    # ``del table[0]``) keeps an empty table from raising IndexError.
    for row in table[1:]:
        del row[1]
        match = date_rgx.match(row[0])
        if match:
            day, month = match.groups()
            # The pattern can match with empty groups (e.g. text that merely
            # contains a dot); only treat the cell as a date when both parts
            # actually hold digits.
            if day and month:
                # NOTE(review): the year is always the current one, so dates
                # across a year boundary would get the wrong year - confirm.
                cur_date = date(date.today().year, int(month), int(day))
        a_dish = dish(cur_date, *row[1:])
        themenu.append(cur_date, a_dish)
    return themenu