-
Notifications
You must be signed in to change notification settings - Fork 1
/
naive_bayes.py
128 lines (105 loc) · 5.13 KB
/
naive_bayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# -*- coding: utf-8 -*-
import numpy as np
# NLP imports
from nltk.corpus import stopwords
stopwords=stopwords.words('german')
# modeling imports
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import fbeta_score, make_scorer
from utils import modeling as m
from utils import cleaning, transformers
import mlflow
from modeling.config import TRACKING_URI, EXPERIMENT_NAME#, TRACKING_URI_DEV
import logging
from time import time
# Logging configuration: timestamped messages on the root handler.
logging.basicConfig(format="%(asctime)s: %(message)s")
# Silence everything below CRITICAL from pyhive to avoid excessive logs.
logging.getLogger("pyhive").setLevel(logging.CRITICAL)
# Module-level logger, emitting INFO and above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
if __name__ == "__main__":
    # Grid-search a Multinomial Naive Bayes classifier for every combination
    # of balancing setting, target label and preprocessing/vectorizer pair,
    # then log each fitted search to MLflow.
    data = m.Posts()
    # F-beta scorer with beta=2 (beta > 1 favours recall over precision).
    scorer = make_scorer(fbeta_score, beta=2)
    # Balancing method -> sampling strategies to try with that method.
    trans_os = {'translate': [0.9], 'oversample': [0.9]}
    TARGET_LABELS = [
        'label_discriminating',
        'label_inappropriate',
        'label_sentimentnegative',
        'label_needsmoderation',
    ]
    embedding_dict_glove = transformers.load_embedding_vectors(
        embedding_style='glove', file="./embeddings/glove_vectors.txt")
    embedding_dict_word2vec = transformers.load_embedding_vectors(
        embedding_style='word2vec', file="./embeddings/word2vec_vectors.txt")
    # Preprocessing callables, applied to the data outside of the grid search
    # (passed to train/evaluate as `constant_preprocessor`).
    preps = {
        'norm': lambda x: cleaning.series_apply_chaining(x, [cleaning.normalize]),
        'glove': transformers.MeanEmbeddingVectorizer(
            embedding_dict=embedding_dict_glove).transform,
        'word2vec': transformers.MeanEmbeddingVectorizer(
            embedding_dict=embedding_dict_word2vec).transform,
    }
    # Bag-of-words vectorizers that are tuned inside the pipeline.
    vecs = {
        'count': CountVectorizer(),
        'tfidf': TfidfVectorizer(),
    }
    # [preprocessor key, vectorizer key] pairs to evaluate.
    PRE_VEC_COMBINATIONS = [
        ['glove', 'glove'],
        ['word2vec', 'word2vec'],
        ['norm', 'count'],
        ['norm', 'tfidf'],
    ]
    # Run-independent MLflow settings; defined once instead of per iteration.
    IS_DEVELOPMENT = False
    mlflow_tags = {
        "cycle4": True,
    }

    for method, strategies in trans_os.items():
        for strategy in strategies:
            print(method, strategy)
            for label in TARGET_LABELS:
                for c in PRE_VEC_COMBINATIONS:
                    prep_name, vec_name = c
                    mlflow_params = {}
                    print(c)
                    constant_preprocessor = preps[prep_name]
                    if vec_name in ['count', 'tfidf']:
                        # Text features: tune the vectorizer together with NB.
                        pipeline = Pipeline([
                            ("vectorizer", vecs[vec_name]),
                            ("clf", MultinomialNB()),
                        ])
                        param_grid = {
                            "vectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
                            "vectorizer__stop_words": [stopwords, None],
                            "vectorizer__min_df": [0.],
                            "vectorizer__max_df": [0.9],
                        }
                        grid_search_params = param_grid.copy()
                        # MLflow params have limited length, so the stop-word
                        # list is logged under a short alias, not in full.
                        grid_search_params["vectorizer__stop_words"] = ["NLTK-German", None]
                        mlflow_params["normalization"] = prep_name
                        mlflow_params["vectorizer"] = vec_name
                    else:
                        # Embedding features: rescale with MinMaxScaler before
                        # feeding them to MultinomialNB.
                        pipeline = Pipeline([
                            ('scaler', MinMaxScaler()),
                            ("clf", MultinomialNB()),
                        ])
                        param_grid = {
                            "clf__alpha": [1.0],
                        }
                        grid_search_params = param_grid.copy()
                        # NOTE(review): logged as 'norm' although the applied
                        # preprocessor is the embedding transform (prep_name);
                        # kept as-is, confirm this is intended.
                        mlflow_params["normalization"] = 'norm'
                        mlflow_params["vectorizer"] = vec_name

                    gs = GridSearchCV(pipeline, param_grid, scoring=scorer,
                                      cv=5, verbose=1, n_jobs=-1)
                    mlflow_params["model"] = "NaiveBayes"
                    # Logged grid description is cut to 249 characters,
                    # presumably to respect MLflow's param length limit.
                    mlflow_params["grid_search_params"] = str(grid_search_params)[:249]

                    mlflow_logger = m.MLFlowLogger(
                        uri=TRACKING_URI,
                        experiment=EXPERIMENT_NAME,
                        is_dev=IS_DEVELOPMENT,
                        params=mlflow_params,
                        # Fresh copy per run so loggers never share one dict.
                        tags=dict(mlflow_tags),
                    )
                    training = m.Modeling(data, gs, mlflow_logger)
                    logger.info("-" * 20)
                    logger.info("Target: %s", label)
                    data.set_label(label=label)
                    data.set_balance_method(balance_method=method,
                                            sampling_strategy=strategy)
                    training.train(constant_preprocessor=constant_preprocessor)
                    training.evaluate(["train", "val"],
                                      constant_preprocessor=constant_preprocessor)
                    with mlflow.start_run(run_name='naivebayes_with_fbeta'):
                        mlflow_logger.log()