# XGBoostClassifier.py (forked from gatapia/py_ml_utils)
import sys
sys.path.append('lib')  # make the locally built xgboost importable, as in the original project layout

import numpy as np
import xgboost as xgb
from sklearn.base import BaseEstimator, ClassifierMixin

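# A thin scikit-learn compatible wrapper over xgboost's native DMatrix/train
# interface: the common booster parameters are exposed as constructor
# arguments so the estimator can plug into sklearn tooling such as
# cross_val_score and GridSearchCV.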
class XGBoostClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, silent=True,
                 use_buffer=True, num_round=10, ntree_limit=0,
                 nthread=None, booster='gbtree',
                 eta=0.3, gamma=0.01,
                 max_depth=6, min_child_weight=1, subsample=1,
                 colsample_bytree=1,
                 l=0, alpha=0, lambda_bias=0, objective='reg:linear',
                 eval_metric=None, seed=0, num_class=None,
                 max_delta_step=0):
        assert booster in ['gbtree', 'gblinear']
        assert objective in ['reg:linear', 'reg:logistic',
                             'binary:logistic', 'binary:logitraw', 'multi:softmax',
                             'multi:softprob', 'rank:pairwise']
        assert eval_metric in [None, 'rmse', 'mlogloss', 'logloss', 'error',
                               'merror', 'auc', 'ndcg', 'map', 'ndcg@n', 'map@n']
        self.silent = silent
        self.use_buffer = use_buffer
        self.num_round = num_round
        self.ntree_limit = ntree_limit
        self.nthread = nthread
        self.booster = booster
        # Parameters for the tree booster
        self.eta = eta
        self.gamma = gamma
        self.max_depth = max_depth
        self.min_child_weight = min_child_weight
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.max_delta_step = max_delta_step
        # Parameters for the linear booster
        self.l = l
        self.alpha = alpha
        self.lambda_bias = lambda_bias
        # Misc
        self.objective = objective
        self.eval_metric = eval_metric
        self.seed = seed
        self.num_class = num_class
    def build_matrix(self, X, opt_y=None):
        # Unwrap pandas objects to their underlying numpy arrays.
        if hasattr(X, 'values'): X = X.values
        if opt_y is not None and hasattr(opt_y, 'values'): opt_y = opt_y.values
        # Pass through anything that is already a DMatrix (it has a 'handle').
        return X if hasattr(X, 'handle') else xgb.DMatrix(X, label=opt_y, missing=np.nan)
    def _make_params(self):
        # Assemble the native xgboost parameter dict, dropping unset (None)
        # entries so they never reach the booster.
        param = {
            'silent': 1 if self.silent else 0,
            'use_buffer': int(self.use_buffer),
            'num_round': self.num_round,
            'ntree_limit': self.ntree_limit,
            'nthread': self.nthread,
            'booster': self.booster,
            'eta': self.eta,
            'gamma': self.gamma,
            'max_depth': self.max_depth,
            'min_child_weight': self.min_child_weight,
            'subsample': self.subsample,
            'colsample_bytree': self.colsample_bytree,
            'max_delta_step': self.max_delta_step,
            'l': self.l,
            'alpha': self.alpha,
            'lambda_bias': self.lambda_bias,
            'objective': self.objective,
            'eval_metric': self.eval_metric,
            'seed': self.seed,
            'num_class': self.num_class,
        }
        return {k: v for k, v in param.items() if v is not None}

    def cv(self, X, y):
        X = self.build_matrix(X, y)
        return xgb.cv(self._make_params(), X, num_boost_round=self.num_round, nfold=3)

    def fit(self, X, y):
        X = self.build_matrix(X, y)
        watchlist = [(X, 'train')]
        self.bst = xgb.train(self._make_params(), X, self.num_round, evals=watchlist)
        return self
    def predict(self, X):
        X = self.build_matrix(X)
        # Note: this returns the booster's raw output; for 'binary:logistic'
        # that is a probability of the positive class, not a class label,
        # so callers wanting labels should threshold the result.
        return self.bst.predict(X)

    def predict_proba(self, X):
        X = self.build_matrix(X)
        predictions = self.bst.predict(X)
        if self.objective == 'multi:softprob': return predictions
        # Otherwise assume a binary objective: column 0 is P(y=0), column 1 is P(y=1).
        return np.vstack([1 - predictions, predictions]).T
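
# ---------------------------------------------------------------------------
# Usage sketch: a minimal smoke test of the wrapper on synthetic data,
# assuming scikit-learn is available alongside xgboost. All figures below
# are arbitrary and illustrative; nothing is tuned.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    clf = XGBoostClassifier(objective='binary:logistic', num_round=20)
    clf.fit(X, y)

    proba = clf.predict_proba(X)  # shape (200, 2): columns are P(y=0), P(y=1)
    acc = ((proba[:, 1] > 0.5) == y).mean()
    print('train accuracy: %.3f' % acc)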