-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_courses.py
64 lines (51 loc) · 2.06 KB
/
parse_courses.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
'''
Script to find all course codes, then determine which of them have video RSS feeds.
The source of Courses.pdf should be the list of all courses (not the catalogue data) from VVZ:
http://vvz.ethz.ch/Vorlesungsverzeichnis/gesamtverzeichnis.view?lang=en
If the courses PDF has already been parsed, READ_PDF can be set to False.
To find which courses have an RSS feed, set FIND_HITS to True.
The feed URL must reflect the current year and semester; this is controlled by the year and season variables below.
'''
import PyPDF2
import re, pickle
import feedparser
from course import Course
# Stage switches: each stage of the script is gated by one of these flags.
READ_PDF = False   # when True, Courses.pdf is parsed and all_courses.pkl is (re)written
FIND_HITS = True   # when True, the video RSS feeds are probed and rss_courses.pkl is written

# Department identifiers; each one is substituted into the feed URL path.
Depts = [
    'd-arch', 'd-baug', 'd-biol', 'd-bsse',
    'd-chab', 'd-erdw', 'd-gess', 'd-hest',
    'd-infk', 'd-itet', 'd-math', 'd-matl',
    'd-mavt', 'd-mtec', 'd-phys', 'd-usys',
]

# Semester selection, also substituted into the feed URL.
year = '2021'
season = 'autumn'  # alternative: 'spring'
if READ_PDF:
    # Stage 1: scan every page of Courses.pdf for course codes and cache the
    # unique codes in all_courses.pkl.

    # A course code is three digits, four digits, then three digits/uppercase
    # letters, separated by hyphens (e.g. 123-4567-00L). Compiled once so the
    # pattern is not looked up again for every page.
    course_code_re = re.compile(r"[0-9]{3}-[0-9]{4}-[0-9A-Z]{3}")

    courses = []
    # The context manager closes the PDF even if a page fails to parse.
    with open('Courses.pdf', 'rb') as pdffile:
        # NOTE(review): PdfFileReader / numPages / getPage / extractText are
        # the legacy PyPDF2 names; newer releases rename them (PdfReader,
        # pages, extract_text). Kept as-is to match the version this script
        # was written against -- confirm against the installed PyPDF2.
        pdfReader = PyPDF2.PdfFileReader(pdffile)
        for page in range(pdfReader.numPages):
            txt = pdfReader.getPage(page).extractText()
            page_codes = course_code_re.findall(txt)
            courses += page_codes
            print(page_codes)  # per-page progress output

    # De-duplicate; sorting keeps the pickled list reproducible between runs
    # (plain set iteration order is not stable across interpreter sessions).
    courses = sorted(set(courses))
    with open('all_courses.pkl', 'wb') as outfile:
        pickle.dump(courses, outfile)
if FIND_HITS:
    # Stage 2: for every cached course code, probe each department's video
    # RSS feed URL and record the first department whose feed has entries.
    # The result (dict: department -> list of Course) goes to rss_courses.pkl.

    # NOTE: pickle.load executes arbitrary code from the file; only use it on
    # the locally generated all_courses.pkl, never on untrusted input.
    with open('all_courses.pkl', 'rb') as infile:
        courses = pickle.load(infile)

    hits = {d: [] for d in Depts}
    for idx in courses:
        for dept in Depts:
            feed = feedparser.parse(
                f'https://video.ethz.ch/lectures/{dept}/{year}/{season}/{idx}.rss.xml?quality=HIGH'
            )
            entries = feed.entries
            if not entries:
                # No feed for this course under this department.
                continue

            # Best effort: a course that cannot be constructed is reported and
            # the next department is tried, instead of being silently dropped.
            try:
                course = Course(dept, idx, year, season)
            except Exception as err:
                print(f'Skipping {idx} in {dept}: {err!r}')
                continue

            hits[dept].append(course)
            # .get() so a feed entry without a subtitle cannot raise after the
            # append; previously that skipped the break and let the same
            # course be recorded again under later departments.
            print(idx, dept, entries[0].get('subtitle', ''))
            break

    with open('rss_courses.pkl', 'wb') as outfile:
        pickle.dump(hits, outfile)