-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparse_data.py
131 lines (118 loc) · 5.44 KB
/
parse_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
This is a positively monstrous (i.e. bad) script which turns course evaluation data from text files into Python objects. Format is like so:
data{
'seasonYEAR':[ #really this is just the file name of the data
{
'coursename':'SUBJ999',
'section':'probably blank',
'instructor':'some guy',
'form_count': number of responses, as an int,
'course_fit': dict mapping each of these options to an int:
('major_required', 'tech_in_major',
'core_required', 'minor_option',
'open_elective'),
'standing': dict mapping each of these options to an int:
('freshman', 'sophomore', 'junior',
'senior', 'graduate'),
'pace': dict mapping each of these options to an int:
('very_fast', 'rather_fast', 'moderate',
'rather_slow', 'very_slow'),
'work_load': dict mapping each of these options to an int:
('very_heavy', 'rather_heavy', 'moderate',
'rather_light', 'very_light'),
# Then I got lazy and started just referring to question number
5: dict mapping each of these responses to an int:
('SA', 'A', 'M', 'D', 'SD', 'NA')
which actually makes very little sense for questions 17-20, which
actually ask for ('E', 'VG', 'G', 'F', 'P'),
6: same kind of dict,
...
20: same kind of dict
}
]
}
"""
import re, cPickle
# File stems of the semester dumps under data/ (each is read as
# 'data/<stem>.txt'); every stem also becomes a key of ``data``.
_SEMESTER_FILES = (
    'fall1997',
    'fall1998',
    'fall1999',
    'fall2000',
    'fall2001',
    'fall2002',
    'fall2003',
    'fall2004',
    'fall2005',
    'fall2006',
    'spring1998',
    'spring1999',
    'spring2000',
    'spring2001',
    'spring2002',
    'spring2003',
    'spring2004',
    'spring2005',
    'spring2006',
)

# Parsed result: semester stem -> list of course dicts. Each semester gets
# its own fresh list so appending to one never affects another.
data = dict((semester, []) for semester in _SEMESTER_FILES)
# Line-level patterns for the course evaluation text files.

# "* COURSE NAME : <code> SECTION:<section>" header line. The course code is a
# single \w+ token; the section is whatever follows "SECTION:" on that line.
course_name = re.compile(r'^\* COURSE NAME\s+:\s+(?P<coursename>\w+)\s+SECTION:(?P<section>.+)$')
# "* INSTRUCTOR : <name>" header line; captures the rest of the line.
instructor = re.compile(r'^\* INSTRUCTOR\s+:\s+(?P<instructor>.+)$')
# Questions 1-4 each have a header pattern (leading whitespace, the question
# number, a period) and an *_opts pattern. When a header matches, opt_parse
# reads the two lines after it, joins them with a space, and matches the
# result against the *_opts pattern. Each option value is captured as \w+
# with an optional trailing '%'.
course_fit = re.compile(r'^\s+1\..+')
# NOTE(review): unlike the other *_opts patterns (which use ':\s+'), this one
# requires exactly one space after each colon -- confirm against the data files.
course_fit_opts = re.compile(r'^MAJOR REQUIRED\s*: (?P<major_required>\w+)%?\s+TECH IN MAJOR\s*: (?P<tech_in_major>\w+)%?\s+CORE REQUIRED\s*: (?P<core_required>\w+)%?\s+MINOR OPTION\s*: (?P<minor_option>\w+)%?\s+OPEN ELECTIVE\s*: (?P<open_elective>\w+)%?.*$')
standing = re.compile(r'^\s+2\..+')
standing_opts = re.compile(r'^FRESHMAN\s*:\s+(?P<freshman>\w+)%?\s+SOPHOMORE\s*:\s+(?P<sophomore>\w+)%?\s+JUNIOR\s*:\s+(?P<junior>\w+)%?\s+SENIOR\s*:\s+(?P<senior>\w+)%?\s+GRADUATE\s*:\s+(?P<graduate>\w+)%?.*$')
pace = re.compile(r'^\s+3\..+')
pace_opts = re.compile(r'^VERY FAST\s*:\s+(?P<very_fast>\w+)%?\s+RATHER FAST\s*:\s+(?P<rather_fast>\w+)%?\s+MODERATE\s*:\s+(?P<moderate>\w+)%?\s+RATHER SLOW\s*:\s+(?P<rather_slow>\w+)%?\s+VERY SLOW\s*:\s+(?P<very_slow>\w+)%?.*$')
work_load = re.compile(r'^\s+4\..+')
work_load_opts = re.compile(r'^VERY HEAVY\s*:\s+(?P<very_heavy>\w+)%?\s+RATHER HEAVY\s*:\s+(?P<rather_heavy>\w+)%?\s+MODERATE\s*:\s+(?P<moderate>\w+)%?\s+RATHER LIGHT\s*:\s+(?P<rather_light>\w+)%?\s+VERY LIGHT\s*:\s+(?P<very_light>\w+)%?.*$')
# "** COURSE FORM COUNT : <digits>" line; captures the digits.
form_count = re.compile(r'^\*\* COURSE FORM COUNT :\s+(?P<count>\d+).*$')
# Any other numbered question row: the question number, a period and a space,
# a run of non-digit text, then six whitespace-separated integer columns
# captured as SA, A, M, D, SD and NA. At most one leading whitespace character
# is allowed, and at least one trailing whitespace character (the line's
# newline counts) is required before the end of the line.
the_rest = re.compile(r'^\s?(?P<number>\d+)\. \D+(?P<SA>\d+)\s+(?P<A>\d+)\s+(?P<M>\d+)\s+(?P<D>\d+)\s+(?P<SD>\d+)\s+(?P<NA>\d+)\s+$')
def opt_parse(this_course, f, opt_regex, groups):
    """Consume the next two lines of ``f`` and parse them as one options row.

    The two lines are stripped, joined with a single space and matched
    against ``opt_regex``; each named group listed in ``groups`` is converted
    to ``int``.

    Args:
        this_course: Unused; kept so existing call sites keep working.
        f: An iterator over lines (e.g. an open file). Exactly two items are
            consumed from it.
        opt_regex: Compiled pattern with a named group for every entry in
            ``groups``.
        groups: Names of the groups to extract.

    Returns:
        A dict mapping each name in ``groups`` to its integer value.

    Raises:
        ValueError: If ``f`` runs out before two lines are read, if the
            joined lines do not match ``opt_regex``, or if a captured value
            is not an integer.
    """
    # Use the builtin next() rather than the Python 2-only f.next() method so
    # this works with any iterator on both Python 2.6+ and Python 3.
    try:
        first = next(f)
        second = next(f)
    except StopIteration:
        raise ValueError('input ended before both option lines could be read')
    lines = ' '.join([first.strip(), second.strip()])
    opts = opt_regex.match(lines)
    if opts is None:
        # Fail with the offending text instead of an AttributeError on None.
        raise ValueError('option lines do not match %r: %r'
                         % (opt_regex.pattern, lines))
    return dict((g, int(opts.group(g))) for g in groups)
# Questions 1-4, one entry each: (header pattern, key stored on the course,
# pattern for the two option lines that follow the header, option names).
_OPTION_QUESTIONS = (
    (course_fit, 'course_fit', course_fit_opts,
     ['major_required', 'tech_in_major',
      'core_required', 'minor_option', 'open_elective']),
    (standing, 'standing', standing_opts,
     ['freshman', 'sophomore', 'junior', 'senior', 'graduate']),
    (pace, 'pace', pace_opts,
     ['very_fast', 'rather_fast', 'moderate', 'rather_slow', 'very_slow']),
    (work_load, 'work_load', work_load_opts,
     ['very_heavy', 'rather_heavy', 'moderate', 'rather_light', 'very_light']),
)

# Parse every semester file into data[path], one dict per course.
for path in data:
    with open('data/%s.txt' % path, 'r') as f:
        # Reset per file: lines that appear before a file's first course
        # header have no course to attach to. Without the reset they would be
        # written into the last course of the previously parsed file (or raise
        # NameError in the very first file).
        this_course = None
        for line in f:
            # A course header starts a new record; later lines fill it in.
            m = course_name.match(line)
            if m:
                this_course = {
                    'coursename': m.group('coursename').strip(),
                    'section': m.group('section').strip()
                }
                data[path].append(this_course)
                continue
            if this_course is None:
                continue
            m = instructor.match(line)
            if m:
                this_course['instructor'] = m.group('instructor').strip()
                continue
            # Questions 1-4: the header line only announces the question;
            # opt_parse consumes the next two lines of f for the values.
            question = next(
                (q for q in _OPTION_QUESTIONS if q[0].match(line)), None)
            if question is not None:
                _, key, opts_regex, groups = question
                this_course[key] = opt_parse(this_course, f, opts_regex, groups)
                continue
            m = form_count.match(line)
            if m:
                this_course['form_count'] = int(m.group('count'))
                continue
            # Remaining numbered questions are stored under their integer
            # question number as {response label: int}.
            m = the_rest.match(line)
            if m:
                n = int(m.group('number'))
                this_course[n] = dict(
                    (r, int(m.group(r)))
                    for r in ['SA', 'A', 'M', 'D', 'SD', 'NA'])
# Write the parsed data to out.pickle. Pickle files should be opened in
# binary mode: text mode can translate newline bytes on some platforms, and
# Python 3's pickle refuses to write to a text-mode file at all.
with open('out.pickle', 'wb') as f:
    cPickle.dump(data, f)