-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_working_data.py
143 lines (109 loc) · 4.78 KB
/
generate_working_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# Python 3.x
#
import numpy as np
import pandas as pd
import okctools
def function1(df):
    """DataFrame -> DataFrame

    Template for a transformation step: take a DataFrame generated by the
    OKCScraper and transform some part of it so it is better suited for
    analysis.

    This is a placeholder only. The body is not implemented, so calling it
    currently returns None regardless of the input.
    """
    return None
def remove_empty_profiles(df):
    """DataFrame -> None (Operates inplace on DataFrame object)

    Drop every row of an OKC profiles DataFrame whose 'name' value is
    missing, then renumber the index from zero. A missing name is used as the
    marker because an empty profile contains a username and nothing else.
    """
    # Boolean mask (as a plain array) of the rows that have no name.
    nameless = df.name.isnull().values
    empty_rows = df.index[nameless]
    df.drop(index=empty_rows, inplace=True)
    # Discard the old labels so the index is contiguous again.
    df.reset_index(drop=True, inplace=True)
def create_wordcount(df):
    """DataFrame -> DataFrame

    Return a copy of an OKC profiles DataFrame with an extra 'wordcount'
    column, computed by passing each value of the 'essays' column to
    okctools.count_words.

    If the input has no 'essays' column a message is printed and the input
    DataFrame is returned unchanged.
    """
    if 'essays' not in df.columns:
        print('Input DataFrame does not contain column "essays".')
        return df

    counts = [okctools.count_words(essay) for essay in df['essays']]
    # assign() builds a new frame; setting the column through .loc on the
    # input triggers a pandas warning.
    return df.assign(wordcount=counts)
def lists_to_dummies(df, drop=False):
    """DataFrame, bool -> DataFrame

    Turn list values into dummy variables. For example, df.orientation[0] might
    be the list ['pansexual','queer'], which would be turned into the dummy
    variables orientation_pansexual and orientation_queer.

    Every column except 'essays' is inspected. A column is treated as
    list-valued when its first non-missing entry is a list. For such a column
    one integer 0/1 column named '<column>_<item>' is appended per distinct
    list item. Rows whose value is missing get 0 in all of those columns.

    Parameters:
        df   -- profiles DataFrame; it is not modified.
        drop -- if True, the original list-valued columns are removed from
                the result.

    Returns a new DataFrame when at least one column was expanded, otherwise
    the input DataFrame itself.

    TODO:
        add arg to specify exactly which columns to dummify
    """
    # Snapshot the candidate columns up front: df is rebound inside the loop
    # as dummy columns are appended. errors='ignore' lets frames without an
    # 'essays' column through.
    candidates = df.drop(columns='essays', errors='ignore').columns

    for col in candidates:
        has_value = df[col].notna()
        if not has_value.any():
            print('Column "{}" has no valid values. You can remove these with '
                  'the "remove_empty_columns" function'
                  .format(col))
            continue
        if not isinstance(df.loc[has_value, col].iloc[0], list):
            continue

        prefix = col + '_'
        # One dict per row of df (empty for missing values), so the dummy
        # frame shares df's index exactly and needs no reindexing.
        rows = [
            {prefix + str(item): 1 for item in lst} if present else {}
            for lst, present in zip(df[col], has_value)
        ]
        # Items absent from a row come out as NaN; they mean "not selected".
        df_dummies = pd.DataFrame(rows, index=df.index).fillna(0).astype(int)

        df = pd.concat([df, df_dummies], axis=1, sort=False)
        if drop:
            df = df.drop(columns=col)
    return df
def remove_empty_columns(df):
    """DataFrame -> None (Operates inplace on DataFrame object)

    Remove every column of the DataFrame that has no valid values, i.e. whose
    entries are all NaN/None.
    """
    all_missing = [col for col in df.columns if all(df[col].isnull())]
    # The drop is in place, so the caller's DataFrame object is modified.
    df.drop(columns=all_missing, inplace=True)
def _is_missing_langlist(value):
    """Return True when value is None or a float NaN rather than a list."""
    return value is None or (isinstance(value, float) and np.isnan(value))


def _clean_langlist(langlist, valid_langs):
    """Return the repaired, duplicate-free version of one language list."""
    cleaned = []
    seen = set()

    def keep(lang):
        # Record every kept language so it can never be added twice.
        if lang not in seen:
            seen.add(lang)
            cleaned.append(lang)

    # Stray fragments 'c' and 'sign' stand for these two languages.
    if 'c' in langlist:
        keep('c++')
    if 'sign' in langlist:
        keep('sign language')
    for lang in langlist:
        if lang in valid_langs:
            keep(lang)
    return cleaned


def repair_lang_features(df, lang_file='okc_lang_options.txt'):
    """DataFrame -> None (Operates inplace on DataFrame object)

    Fix lang_primary and lang_secondary fields from data collected using older
    versions of fetchusers.py that resulted in stray words being collected into
    the data such as 'very' and 'it'.

    Each language list is filtered down to the entries found in lang_file
    (compared in lower case), in their original order and without
    duplicates. A list containing the fragment 'c' gets 'c++' and a list
    containing 'sign' gets 'sign language', both placed first. Missing values
    (None or NaN) are stored as np.nan.

    Parameters:
        df        -- profiles DataFrame with 'lang_primary' and
                     'lang_secondary' columns; both are overwritten.
        lang_file -- path of a text file with one language name per line.
                     Defaults to 'okc_lang_options.txt' in the current
                     working directory.

    Raises OSError if lang_file cannot be opened and KeyError if either
    language column is absent.
    """
    # The context manager closes the file; a frozenset makes each membership
    # test constant time.
    with open(lang_file, 'r') as handle:
        valid_langs = frozenset(handle.read().lower().splitlines())

    for column in ('lang_primary', 'lang_secondary'):
        df[column] = [
            np.nan if _is_missing_langlist(langlist)
            else _clean_langlist(langlist, valid_langs)
            for langlist in df[column]
        ]
if __name__ == "__main__":
    # Script entry point: load the scraped profiles, clean them in place and
    # apply a few manual corrections.
    # NOTE(review): infolder, outfolder, replace_empty_list and leave_alone
    # are assigned here but not referenced again in this file.
    infolder = 'data/input/'
    outfolder = 'data/working/'

    df = okctools.load_profiles_df(version='py3')

    # Each cleaning step mutates df in place and returns None.
    for clean_step in (remove_empty_profiles,
                       remove_empty_columns,
                       repair_lang_features):
        clean_step(df)

    # Dummy-variable expansion is currently disabled:
    #df = lists_to_dummies(df)

    # See the notebook for details on the "why" of what I'm doing here.
    # Both assignments are manual inferences; the gender one is a simplistic
    # assumption.
    df.loc[502, 'status'] = 'single'
    df.loc[203, 'gender'] = ['woman']

    replace_empty_list = [
        'lang_primary', 'lang_secondary', 'ethnicity', '',
    ]
    # NOTE(review): 'build' appears twice and the list ends with an empty
    # string; kept exactly as written.
    leave_alone = [
        'dogs', 'cats', 'height', 'smokes', 'build', 'drinks',
        'build', 'education', 'ed_prefix', '',
    ]