forked from adamkalman/CDIPS_2015
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplay_with_data.py
31 lines (27 loc) · 977 Bytes
/
play_with_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import csv
import sys
import numpy as np
# Path of the CSV file that is read when this module is run as a script.
datafile = 'EP_data.csv'

# Raise the csv module's per-field size cap as high as the platform allows so
# that very long fields do not abort parsing. sys.maxsize does not fit in a
# C long everywhere (e.g. 64-bit Windows), in which case field_size_limit
# raises OverflowError; 2**31 - 1 fits in a C long on every platform.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2 ** 31 - 1)
def csv_iterator(datafile):
    """Yield each row of a CSV file as a dict keyed by the header row.

    Rows that carry no 'id' field get one: the 1-based count of rows
    yielded so far, as a string. An 'id' value already present in the file
    is left untouched.

    Args:
        datafile: Path of the CSV file to read.

    Yields:
        One dict per row produced by csv.DictReader, always containing
        an 'id' key.
    """
    # Python 2 needs the 'U' flag for universal newlines, but that flag was
    # removed in Python 3.11. Python 3 text mode applies universal newlines
    # by default, so plain 'r' gives the same behavior there.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    # The with-block closes the file once the generator is exhausted or
    # closed, instead of leaking the handle.
    with open(datafile, mode) as handle:
        for rownum, doc in enumerate(csv.DictReader(handle), 1):
            doc.setdefault('id', str(rownum))
            yield doc
def collect_post_lengths(docs, progress_every=10000):
    """Gather post lengths grouped by gender.

    Args:
        docs: Iterable of dicts that each have 'id', 'gender' and 'content'
            keys.
        progress_every: A document's id is printed whenever int(id) is a
            multiple of this value, as a progress indicator.

    Returns:
        A (male_lengths, female_lengths) tuple of lists holding
        len(doc['content']) for documents whose gender is 'M' and 'F'
        respectively. Documents with any other gender value are not counted.

    Raises:
        ValueError: If a document's 'id' cannot be converted with int().
    """
    lengths = {'M': [], 'F': []}
    for doc in docs:
        bucket = lengths.get(doc['gender'])
        if bucket is not None:
            bucket.append(len(doc['content']))
        if int(doc['id']) % progress_every == 0:
            # A single parenthesized argument prints the same on Python 2 and 3.
            print(doc['id'])
    return lengths['M'], lengths['F']


if __name__ == "__main__":
    post_length_M, post_length_F = collect_post_lengths(csv_iterator(datafile))
    # Formatting into one string keeps the output identical on Python 2 and 3
    # (a multi-argument print(...) would print a tuple on Python 2).
    print("median length of male post %s"
          % (np.median(np.array(post_length_M)),))
    print("median length of female post %s"
          % (np.median(np.array(post_length_F)),))