-
Notifications
You must be signed in to change notification settings - Fork 0
/
profileparser.py
217 lines (186 loc) · 8.05 KB
/
profileparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
import regex
import pandas as pd
# Keyword tables used by the parse_* functions below.
# Each table maps an output column name to the keywords searched for in the
# corresponding section of profile text.
# TODO: maybe these should be pulled from a csv

# Keywords for the "basics" section.
basicskws = {
    'orientation': [
        'straight', 'gay', 'bisexual', 'asexual',
        'demisexual', 'heteroflexible', 'homoflexible',
        'lesbian', 'pansexual', 'queer', 'questioning',
        'sapiosexual',
    ],
    'gender': [
        'woman', 'man', 'agender', 'androgynous',
        'bigender', 'cis man', 'cis woman',
        'genderfluid', 'genderqueer',
        'gender nonconforming', 'hijra', 'intersex',
        'binary', 'other', 'pangender', 'transfeminine',
        'transgender', 'transmasculine', 'transsexual',
        'trans man', 'trans woman',
    ],
    'status': ['single', 'seeing', 'married', 'open'],
    # \u2011 is NON-BREAKING HYPHEN, not an ASCII '-'.
    'monogamous': ['monogamous', u'non\u2011monogamous'],
    'build': [
        'rather', 'thin', 'overweight', 'average', 'fit',
        'jacked', 'extra', 'full', 'curvy', 'used',
    ],
}

# Keywords for the "background" section.
backgroundkws = {
    'ethnicity': [
        'asian', 'black', 'hispanic / latin', 'indian',
        'middle eastern', 'native american',
        'pacific islander', 'white', 'other ethnicity',
        'multi-ethnic',
    ],
    'ed_prefix': ['attended', 'working on', 'dropped out of'],
    'education': [
        'high school', 'two-year college',
        'university', 'space camp', 'post grad',
    ],
    # \u2019 is RIGHT SINGLE QUOTATION MARK (typographic apostrophe).
    'religion_pre': [
        u'it\u2019s important', u'not important',
        'laughing about it',
    ],
    'religion': [
        'agnostic', 'atheist', 'christian', 'jewish',
        'catholic', 'muslim', 'hindu', 'buddhist',
        'sikh', 'other religion',
    ],
}

# Keywords for the "misc" section.
misckws = {
    'kids_present': [u'doesn\u2019t have kids', 'has kids'],
    'kids_future': [
        'but might want them', 'but wants them',
        u'doesn\u2019t want them',
    ],
    'dogs': ['dogs'],
    'cats': ['cats'],
    'sign': [
        'aquarius', 'pisces', 'aries', 'taurus',
        'gemini', 'cancer', 'leo', 'virgo', 'libra',
        'scorpio', 'sagittarius', 'capricorn',
    ],
    'diet': ['omnivore', 'vegetarian', 'vegan', 'kosher', 'halal'],
}

# Keywords for the "looking for" section.
lookingkws = {
    'lf_gender': ['women', 'men', 'people'],
    'lf_status': ['single'],
    'lf_rel_type': ['short', 'long', 'hookup', 'friends'],
    'lf_monogamous': ['monogamous'],
}
def parse_basics(text):
    """Parse the "basics" section of a profile into a DataFrame.

    Columns produced: ``height`` (total inches), ``orientation`` and
    ``gender`` (each a list of matched keywords), plus one column for every
    other key of ``basicskws`` holding a single matched keyword.  A column
    is set to None when nothing could be extracted for it.

    The input text and the resulting frame are echoed to stdout.

    :param text: text of the basics section.  Keywords are matched
        case-sensitively, so the text is presumably lower-cased by the
        caller -- TODO confirm.
    :returns: pandas.DataFrame with the columns described above.
    """
    print(text.encode('utf-8'))
    df = pd.DataFrame()

    # Height: the first two numbers in the text are read as feet and inches.
    # With fewer than two numbers the height cannot be computed, so it is
    # recorded as missing instead of raising IndexError.
    numbers = regex.findall(r'\d+', text)
    if len(numbers) >= 2:
        df['height'] = [int(numbers[0]) * 12 + int(numbers[1])]
    else:
        df['height'] = None

    # Multi-valued categories: every keyword found as a whole word is kept,
    # and the resulting list is stored in a single cell.
    listvals = ['orientation', 'gender']
    for category in listvals:
        values = [kw for kw in basicskws[category]
                  if regex.search(r'\b(%s)\b' % kw, text)]
        df[category] = [values] if values else None

    # Single-valued categories: plain substring search.
    for category in basicskws:
        if category in listvals:
            continue
        found = [kw for kw in basicskws[category] if kw in text]
        if not found:
            df[category] = None
            continue
        # One keyword can be contained in another ('monogamous' is part of
        # the 'non-monogamous' keyword), in which case both match.  Keep
        # only the longest match so the column always holds exactly one
        # value for the one row being built.
        df[category] = [max(found, key=len)]

    print(df)
    return df
def parse_background(text):
    """Parse the "background" section of a profile into a DataFrame.

    Columns produced: ``ethnicity`` (list of matched keywords), one
    single-valued column for every other key of ``backgroundkws``, then
    ``lang_secondary`` (words that follow "some") and ``lang_primary``
    (every word left once keywords and filler words have been removed).
    A column is set to None when nothing could be extracted for it.

    The input text and the resulting frame are echoed to stdout.

    :param text: text of the background section.  Keywords are matched
        case-sensitively, so the text is presumably lower-cased by the
        caller -- TODO confirm.
    :returns: pandas.DataFrame with the columns described above.
    """
    print(text.encode('utf-8'))
    df = pd.DataFrame()

    # Multi-valued categories.  Each matched keyword is removed from the
    # text so that it is not later mistaken for a language name.  Keywords
    # are literal strings, so a literal replace is used rather than treating
    # them as regex patterns.
    listvals = ['ethnicity']
    for category in listvals:
        values = []
        for kw in backgroundkws[category]:
            if kw in text:
                values.append(kw)
                text = text.replace(kw, '')
        df[category] = [values] if values else None

    # Single-valued categories.  Only the first matching keyword is stored
    # and stripped; storing several values would not fit the one-row frame.
    for category in backgroundkws:
        if category in listvals:
            continue
        found = [kw for kw in backgroundkws[category] if kw in text]
        if not found:
            df[category] = None
            continue
        df[category] = [found[0]]
        text = text.replace(found[0], '')

    # Languages introduced by "some" are treated as secondary languages.
    secondary = regex.findall(r'(?<=some\s)(\w+)', text)
    df['lang_secondary'] = [secondary] if secondary else None

    # Drop the secondary languages and filler words; whatever words remain
    # are taken as primary languages.  The filler words are anchored with \b
    # so that e.g. the "and" inside "mandarin" is left intact.
    text = regex.sub(r'some\s\w+|\b(?:speaks|and|but|it\u2019s)\b', '', text)
    primary = regex.findall(r'\w+', text)
    df['lang_primary'] = [primary] if primary else None

    print(df)
    return df
def parse_misc(text):
    """Parse the "misc" section of a profile into a DataFrame.

    Columns produced: ``smokes``, ``drinks`` and ``drugs`` (the qualifier
    word found next to the habit, normalised to 'yes' / 'no' where
    possible), plus one single-valued column for every key of ``misckws``.
    A column is set to None when nothing could be extracted for it.

    The input text and the resulting frame are echoed to stdout.

    :param text: text of the misc section.  Keywords are matched
        case-sensitively, so the text is presumably lower-cased by the
        caller -- TODO confirm.
    :returns: pandas.DataFrame with the columns described above.
    """
    print(text.encode('utf-8'))
    df = pd.DataFrame()

    # OKC alters the word order and the words themselves depending on user
    # answers, making it difficult to generate regexes dynamically.
    # This method isn't pretty, but it gets the job done.
    categories = ['smokes', 'drinks', 'drugs']
    # sometimes uses 'do drugs', sometimes 'does drugs'
    terms = ['smokes', 'drinks?', 'doe?s? drugs']
    # Pattern for the qualifier word.  For drinks/drugs the typographic
    # apostrophe (U+2019) is allowed so that contractions stay in one piece.
    # Inside a character class "(?:...)" does not group, so the apostrophe
    # is listed directly rather than wrapped in a group.
    qualifiers = [r'\w+', r'[\w\u2019]+', r'[\w\u2019]+']
    # The qualifier is the word directly before or directly after the term.
    # The lookbehind is variable-width for the optional-letter terms.
    matches = [
        regex.findall(r'%s(?= %s)|(?<=%s )%s' % (word, term, term, word), text)
        for word, term in zip(qualifiers, terms)
    ]

    # standardize and assign values
    for category, match in zip(categories, matches):
        if not match:
            df[category] = None
        elif match[0] in ('regularly', 'often'):
            df[category] = ['yes']
        elif match[0] in ('never', u'doesn\u2019t'):
            df[category] = ['no']
        else:
            # Only the first qualifier is used, as in the branches above;
            # assigning the whole match list could produce several rows.
            df[category] = [match[0]]

    # Get the rest of the values using simple keyword search.  Only the
    # first matching keyword is stored so the frame keeps a single row.
    for category, kws in misckws.items():
        found = [kw for kw in kws if kw in text]
        df[category] = [found[0]] if found else None

    print(df)
    return df
def parse_lookingfor(text):
    """Parse the "looking for" section of a profile into a DataFrame.

    Columns produced: ``lf_dist`` (whole miles, -1 for "anywhere"),
    ``lf_age_lower`` / ``lf_age_upper``, ``lf_rel_type`` (list of matched
    keywords, possibly empty), ``lf_gender``, ``lf_status`` and
    ``lf_monogamous``.  A value is None when it could not be extracted.

    The input text and the resulting frame are echoed to stdout.

    :param text: text of the looking-for section.  Keywords are matched
        case-sensitively, so the text is presumably lower-cased by the
        caller -- TODO confirm.
    :returns: pandas.DataFrame with the columns described above.
    """
    print(text.encode('utf-8'))
    df = pd.DataFrame()

    # Distance.  It is possible for the user to select 'Anywhere' instead
    # of a number range.
    dist = regex.findall(r'within (\d+) (\w+)', text)
    if dist:
        amount, unit = dist[0]
        miles = int(amount)
        if unit == 'km':
            # Convert kilometres to miles, truncated to a whole number.
            miles = int(miles * 0.621371)
        df['lf_dist'] = [miles]
    elif 'anywhere' in text:
        df['lf_dist'] = [-1]
    else:
        df['lf_dist'] = None

    # Age.  A range is written with a non-breaking hyphen (U+2011) between
    # the bounds; a user may also select a single age, written after
    # "age ".  When neither form is present both bounds are recorded as
    # missing instead of raising IndexError.
    lowers = regex.findall(r'\d+(?=\u2011)', text)
    uppers = regex.findall(r'(?<=\u2011)\d+', text)
    singles = regex.findall(r'(?<=age )\d+', text)
    if lowers and uppers:
        age_lower = [int(lowers[0])]
        age_upper = [int(uppers[0])]
    elif singles:
        age_lower = age_upper = [int(singles[0])]
    else:
        age_lower = age_upper = None
    df['lf_age_lower'] = age_lower
    df['lf_age_upper'] = age_upper

    # Can result in a list of strings
    df['lf_rel_type'] = [[kw for kw in lookingkws['lf_rel_type']
                          if kw in text]]

    # Single entries.  lf_gender needs a custom loop with a break so that
    # 'men' doesn't match twice with 'women' and 'men'.  The else branch
    # keeps the column present even when no gender keyword is found.
    for kw in lookingkws['lf_gender']:
        if kw in text:
            df['lf_gender'] = [kw]
            break
    else:
        df['lf_gender'] = None

    df['lf_status'] = [kw if kw in text else None
                       for kw in lookingkws['lf_status']]
    # NOTE(review): substring search means 'monogamous' is also found inside
    # a hyphenated "non-...monogamous" phrase -- confirm whether that should
    # be recorded differently.
    df['lf_monogamous'] = [kw if kw in text else None
                           for kw in lookingkws['lf_monogamous']]

    print(df)
    return df