jobparser.py
from pymongo import MongoClient, UpdateOne
from pprint import pprint
import nltk
# nltk.download('punkt')      # uncomment on first run to download
# nltk.download('stopwords')  # uncomment on first run to download
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from fastai.tabular.all import *
import pandas as pd
import os

from job_word_lists import *

CONNECTION_STRING = "mongodb://%s:%s@%s:27017/%s" % (
    os.environ["MONGO_USER"], os.environ["MONGO_PASS"],
    os.environ["MONGO_HOST"], os.environ["MONGO_DB_NAME"])

STOP_WORDS = stopwords.words('english')
IGNORED_WORDS = IGNOREABLE_WORDS + STOP_WORDS
SKILLS = set(GOOD_SKILLS + NEUTRAL_SKILLS + BAD_SKILLS)
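# The wildcard import above is assumed to provide the word/phrase lists used below.
# A hypothetical sketch of what job_word_lists might contain (values are illustrative,
# not from this repo):
#   IGNOREABLE_WORDS  = ["sr", "jr", "remote", "hybrid"]       # title noise words
#   GOOD_SKILLS       = ["python", "sql"]                      # skills looked for in posts
#   NEUTRAL_SKILLS    = ["jira"]
#   BAD_SKILLS        = ["cobol"]
#   GOOD_PHRASES      = ["software engineer"]                  # title phrases fed to the model
#   BAD_PHRASES       = ["sales representative"]
#   PERTINENT_PHRASES = ["backend"]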
def occurrence_cutoff(phrase, phrase_count, jobs_count):
    """Keep a phrase only if it appears in enough jobs to be meaningful."""
    if phrase_count == 1:
        return False
    match len(phrase.split()):
        case 1 | 2:  # 1- and 2-word phrases: at least 1% of jobs
            return phrase_count / jobs_count >= 0.01
        case 3 | 4:  # 3- and 4-word phrases: at least 0.5% of jobs
            return phrase_count / jobs_count >= 0.005
    return False
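# Illustrative example with hypothetical counts (jobs_count = 400):
#   occurrence_cutoff("engineer", 12, 400)                 -> True  (3%    >= 1%)
#   occurrence_cutoff("software engineer", 3, 400)         -> False (0.75% <  1%)
#   occurrence_cutoff("senior software engineer", 3, 400)  -> True  (0.75% >= 0.5%)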
# Regular expression pattern used to split job titles on punctuation
title_pattern = r"\s*[-,()]+\s*|\s*\.\s+|\s*\.$"
# gaps=True treats the pattern as the separator; the text between matches becomes the tokens
TITLE_TOKENIZER = RegexpTokenizer(title_pattern, gaps=True)
def parse_job_titles_by_phrases(job_title):
    """Break a job title into every 1- to 4-word phrase, skipping ignored words."""
    position_tokens = TITLE_TOKENIZER.tokenize(job_title)
    phrases = set()
    for token in position_tokens:
        words = token.lower().split()
        words = [word for word in words if word not in IGNORED_WORDS]
        for i in range(len(words)):
            for j in range(1, min(5, len(words) - i + 1)):  # phrases of length 1 to 4 words
                phrases.add(' '.join(words[i:i + j]))
    return phrases
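# Illustrative example (exact output depends on IGNORED_WORDS from job_word_lists):
#   parse_job_titles_by_phrases("Software Engineer - Backend (Remote)")
# first splits on "-", "(" and ")" into ["Software Engineer", "Backend", "Remote"], then
# yields phrases such as {"software", "engineer", "software engineer", "backend", "remote"},
# minus anything filtered out as an ignored word.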
# Regular expression pattern used to split job-post sentences on punctuation
post_pattern = r"\s*[-/,();:&]+\s*|\s*[!?.]$"
# gaps=True treats the pattern as the separator; the text between matches becomes the tokens
POST_TOKENIZER = RegexpTokenizer(post_pattern, gaps=True)
def parse_job_posts_by_skills(job_post):
    """Return one list of matched skill words per line/sentence of the job post."""
    parsed_skills = []
    # Split into sentences, then further on newlines; normalize non-breaking
    # spaces and trim whitespace.
    sentences = [
        sentence.replace(u'\xa0', u' ').strip()
        for s in nltk.sent_tokenize(job_post)
        for sentence in s.lower().split('\n')
    ]
    for sentence in sentences:
        if len(sentence):
            words = [w for ws in POST_TOKENIZER.tokenize(sentence) for w in ws.split()]
            line_skills = [word for word in words if word in SKILLS]
            if line_skills:
                parsed_skills.append(line_skills)
    return parsed_skills
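# Illustrative example (assumes "python", "sql", and "docker" are in the SKILLS set):
#   parse_job_posts_by_skills("Requirements: Python, SQL.\nNice to have: Docker")
# would return [["python", "sql"], ["docker"]] -- one inner list per line/sentence
# that mentions at least one known skill.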
def count_words(job_post, job_posts_word_count):
    """Count non-stop-word occurrences in sentences that mention at least one skill."""
    # Split into sentences, then further on newlines; normalize non-breaking
    # spaces and trim whitespace.
    sentences = [
        sentence.replace(u'\xa0', u' ').strip()
        for s in nltk.sent_tokenize(job_post)
        for sentence in s.lower().split('\n')
    ]
    for sentence in sentences:
        if len(sentence):
            words = [w for ws in POST_TOKENIZER.tokenize(sentence) for w in ws.split()]
            # Only count words from sentences that mention at least one known skill
            if SKILLS and not any(word in SKILLS for word in words):
                continue
            for word in words:
                if word not in STOP_WORDS:
                    job_posts_word_count[word] = job_posts_word_count.get(word, 0) + 1
def retrieve_mongo_jobs():
    connection = MongoClient(CONNECTION_STRING)
    jobsDB = connection["JobSearchDB"]
    jobsCollection = jobsDB["jobs"]
    # Return the collection (needed later for updates) plus a newest-first cursor
    return jobsCollection, jobsCollection.find().sort('datePosted', -1)
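# Shape of a job document, as implied by the fields read and written below
# (values are illustrative):
#   {"_id": ..., "position": "Software Engineer", "fullJobPost": "...",
#    "datePosted": ..., "liked": True,   # optional; absent when the job is not yet rated
#    "titleRanking": 0.87}               # optional; written back by parse_jobs()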
def parse_jobs():
    jobsColl, jobs = retrieve_mongo_jobs()
    ml_phrases = list(set(GOOD_PHRASES + BAD_PHRASES + PERTINENT_PHRASES))
    process_data_set = []
    data = []
    num_columns = len(ml_phrases) + 1

    # Base case of failing when zero phrases matched
    for _ in range(20):  # average of 4 chosen for training set
        data.append([False] * num_columns)

    # Map each ML phrase to its column index (column 0 is the dependent variable)
    ml_phrase_index = {}
    for i in range(1, num_columns):
        ml_phrase_index[ml_phrases[i - 1]] = i

    phrase_counter = {}
    job_posts_word_count = {}
    likes = [0, 0, 0]  # [disliked, not rated, liked]
    for job in jobs:
        if "liked" in job:
            liked = job["liked"]
            if liked == True:
                likes[2] += 1
                liked = 1
            else:
                likes[0] += 1
                liked = -1
        else:
            likes[1] += 1
            liked = 0

        phrases = parse_job_titles_by_phrases(job["position"])
        row = [False] * num_columns
        for phrase in phrases:
            if phrase in phrase_counter:
                phrase_counter[phrase][1 + liked] += 1
            else:
                like_counts = [0, 0, 0]
                like_counts[1 + liked] += 1
                phrase_counter[phrase] = like_counts
            if phrase in ml_phrase_index:
                row[ml_phrase_index[phrase]] = True

        if liked:
            if liked == 1:
                row[0] = True
            data.append(row)
        elif "titleRanking" not in job:
            process_data_set.append(row + [job['_id']])

        # TODO: logic to rank skills and title phrases
        post_skills = parse_job_posts_by_skills(job["fullJobPost"])
        # Used for initially finding skill words from job posts
        count_words(job["fullJobPost"], job_posts_word_count)

    # Report the most common title phrases that were rated at least once
    jobs_count = sum(likes)
    phrase_counter = list(filter(
        lambda x: x[1][0] + x[1][2] > 0 and occurrence_cutoff(x[0], sum(x[1]), jobs_count),
        phrase_counter.items()))
    phrase_counter.sort(key=lambda x: sum(x[1]))
    # print(list(map(lambda x: x[0], phrase_counter)))
    pprint(phrase_counter[-50:])

    # Report the most common non-skill words from job posts
    for skill in SKILLS:
        if skill in job_posts_word_count:
            del job_posts_word_count[skill]
    job_posts_word_count = list(job_posts_word_count.items())
    job_posts_word_count.sort(key=lambda x: x[1])
    print(len(job_posts_word_count))
    pprint(job_posts_word_count[-20:])

    dep_var = 'job_liked'
    df = pd.DataFrame(data, columns=[dep_var] + ml_phrases)
    # Split the data into training and validation sets
    splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))
    # Build the tabular data object and its data loaders
    to = TabularPandas(df, procs=[Categorify, FillMissing],
                       cat_names=ml_phrases, y_names=dep_var, splits=splits)
    dls = to.dataloaders(bs=64)
    # Define and train the model
    learn = tabular_learner(dls, metrics=accuracy)
    learn.fine_tune(52)
    # Evaluate the model on the validation set
    learn.show_results()

    # Score the unranked jobs and write titleRanking back to MongoDB
    fdf = pd.DataFrame(process_data_set, columns=[dep_var] + ml_phrases + ['_id'])
    dl = learn.dls.test_dl(fdf)
    # get_preds returns (probabilities, targets); column 1 is the probability of the
    # positive ("liked") class
    predictions = learn.get_preds(dl=dl)
    updates = []
    for i, row in fdf.iterrows():
        updates.append(UpdateOne(
            {'_id': row['_id'], 'titleRanking': {'$exists': False}},
            {'$set': {'titleRanking': float(predictions[0][i][1])}},
        ))
    if updates:  # bulk_write raises InvalidOperation on an empty request list
        jobsColl.bulk_write(updates)
if __name__ == "__main__":
    parse_jobs()
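# Example invocation (assumes the MONGO_* environment variables are set):
#   MONGO_USER=... MONGO_PASS=... MONGO_HOST=... MONGO_DB_NAME=... python jobparser.py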