Skip to content

Commit

Permalink
Adaboost algorithm and support of changes in DAAL (#134)
Browse files Browse the repository at this point in the history
* DAAL_DEPRECATED added to ignore list

* adaboost params are added; weights are ignored for regression algorithms that do not support them

* resultsToEvaluate parameter is added in Logistic Regression

* adaboost example modification

* set version of DAAL for adaboost

* check of DAAL version in Logistic Regression example

* additional handling of DAAL_DEPRECATED

* nClasses parameter in KNN algorithm

* stump, brownboost, logitboost algos
  • Loading branch information
Alexander-Makaryev authored Oct 16, 2019
1 parent 3bc0c39 commit 9e86424
Show file tree
Hide file tree
Showing 9 changed files with 38 additions and 20 deletions.
1 change: 1 addition & 0 deletions daal4py/sklearn/neighbors/kdtree_knn_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def fit(self, X, y):

# Fit the model
train_algo = d4p.kdtree_knn_classification_training(fptype=fptype,
nClasses=self.n_classes_,
engine=d4p.engines_mcg59(seed=self.seed_))
train_result = train_algo.compute(X, y_)

Expand Down
5 changes: 3 additions & 2 deletions examples/adaboost_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,10 @@
def main(readcsv=read_csv, method='defaultDense'):
infile = "./data/batch/adaboost_train.csv"
testfile = "./data/batch/adaboost_test.csv"
nClasses = 2

# Configure an adaboost training object
train_algo = d4p.adaboost_training()
train_algo = d4p.adaboost_training(nClasses=nClasses)

# Read data. Let's have 20 independent, and 1 dependent variable (for each observation)
indep_data = readcsv(infile, range(20))
Expand All @@ -44,7 +45,7 @@ def main(readcsv=read_csv, method='defaultDense'):
train_result = train_algo.compute(indep_data, dep_data)

# Now let's do some prediction
predict_algo = d4p.adaboost_prediction()
predict_algo = d4p.adaboost_prediction(nClasses=nClasses)
# read test data (with same #features)
pdata = readcsv(testfile, range(20))
# now predict using the model from the training above
Expand Down
5 changes: 3 additions & 2 deletions examples/kdtree_knn_classification_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,12 @@ def main(readcsv=read_csv, method='defaultDense'):

# Read data. Let's use 5 features per observation
nFeatures = 5
nClasses = 5
train_data = readcsv(train_file, range(nFeatures))
train_labels = readcsv(train_file, range(nFeatures, nFeatures+1))

# Create an algorithm object and call compute
train_algo = d4p.kdtree_knn_classification_training()
train_algo = d4p.kdtree_knn_classification_training(nClasses=nClasses)
# 'weights' is optional argument, let's use equal weights
# in this case results must be the same as without weights
weights = np.ones((train_data.shape[0], 1))
Expand All @@ -64,6 +65,6 @@ def main(readcsv=read_csv, method='defaultDense'):

if __name__ == "__main__":
(train_result, predict_result, predict_labels) = main()
print("KD-tree based kNN classification results (first 20 observations):")
print("KD-tree based kNN classification results:")
print("Ground truth(observations #30-34):\n", predict_labels[30:35])
print("Classification results(observations #30-34):\n", predict_result.prediction[30:35])
11 changes: 9 additions & 2 deletions examples/log_reg_dense_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,15 @@ def main(readcsv=read_csv, method='defaultDense'):
predict_data = readcsv(testfile, range(nFeatures))

# set parameters and compute predictions
predict_alg = d4p.logistic_regression_prediction(nClasses=nClasses,
resultsToCompute="computeClassesLabels|computeClassesProbabilities|computeClassesLogProbabilities")
# DAAL versions before 2020.0 use a different parameter name (resultsToCompute instead of resultsToEvaluate)
from daal4py import __daal_link_version__ as dv
daal_version = tuple(map(int, (dv[0:4], dv[4:8])))
if daal_version < (2020,0):
predict_alg = d4p.logistic_regression_prediction(nClasses=nClasses,
resultsToCompute="computeClassesLabels|computeClassesProbabilities|computeClassesLogProbabilities")
else:
predict_alg = d4p.logistic_regression_prediction(nClasses=nClasses,
resultsToEvaluate="computeClassLabels|computeClassProbabilities|computeClassLogProbabilities")
predict_result = predict_alg.compute(predict_data, train_result.model)
# the prediction result provides prediction, probabilities and logProbabilities
assert predict_result.probabilities.shape == (predict_data.shape[0], nClasses)
Expand Down
10 changes: 5 additions & 5 deletions examples/run_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,11 @@
req_version = defaultdict(lambda:(2019,0))
req_version['decision_forest_classification_batch.py'] = (2019,1)
req_version['decision_forest_regression_batch.py'] = (2019,1)
req_version['adaboost_batch.py'] = (2020,1)
req_version['brownboost_batch.py'] = (2020,1)
req_version['logitboost_batch.py'] = (2020,1)
req_version['stump_classification_batch.py'] = (2020,1)
req_version['stump_regression_batch.py'] = (2020,1)
req_version['adaboost_batch.py'] = (2020,0)
req_version['brownboost_batch.py'] = (2020,0)
req_version['logitboost_batch.py'] = (2020,0)
req_version['stump_classification_batch.py'] = (2020,0)
req_version['stump_regression_batch.py'] = (2020,0)
req_version['saga_batch.py'] = (2019,3)
req_version['dbscan_batch.py'] = (2019,5)
req_version['lasso_regression_batch.py'] = (2019,5)
Expand Down
2 changes: 1 addition & 1 deletion generator/gen_daal4py.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ class cython_interface(object):
'serializeImpl', 'deserializeImpl', 'serialImpl',
'getEpsilonVal', 'getMinVal', 'getMaxVal', 'getPMMLNumType', 'getInternalNumType', 'getIndexNumType',
'allocateNumericTableImpl', 'allocateImpl', 'allocate', 'initialize',
'setPartialResultStorage', 'addPartialResultStorage',]
'setPartialResultStorage', 'addPartialResultStorage']

# files we ignore/skip
ignore_files = ['daal_shared_ptr.h', 'daal.h', 'daal_win.h', 'algorithm_base_mode_batch.h',
Expand Down
2 changes: 2 additions & 0 deletions generator/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,8 @@ def parse_header(header, ignores):
# first strip off end-of-line comments, unless the line contains a link (http:// or https://)
if not re.search(r'https?://', l):
l = l.split('//')[0]
# delete 'DAAL_DEPRECATED'
l = l.replace('DAAL_DEPRECATED ', '')
# apply each parser, continue to next line if possible
for p in parsers:
if p.parse(l, ctxt):
Expand Down
10 changes: 7 additions & 3 deletions generator/wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,8 @@
# given a C++ namespace and a DAAL version, return if namespace/algo should be
# wrapped in daal4py.
def wrap_algo(algo, ver):
#return True if 'kmeans' in algo and not 'interface' in algo else False
# Ignore some algos if using older DAAL
if ver < (2020, 1) and any(x in algo for x in ['stump', 'adaboost', 'brownboost', 'logitboost',]):
if ver < (2020, 0) and any(x in algo for x in ['adaboost', 'stump', 'brownboost', 'logitboost',]):
return False
# ignore deprecated version of stump
if 'stump' in algo and not any(x in algo for x in ['stump::regression', 'stump::classification']):
Expand Down Expand Up @@ -73,6 +72,8 @@ def wrap_algo(algo, ver):
'algorithms::optimization_solver::lbfgs': [('function', 'daal::algorithms::optimization_solver::sum_of_functions::BatchPtr')],
'algorithms::optimization_solver::adagrad': [('function', 'daal::algorithms::optimization_solver::sum_of_functions::BatchPtr')],
'algorithms::dbscan': [('epsilon', 'fptype'), ('minObservations', 'size_t')],
'algorithms::adaboost::prediction': [('nClasses', 'size_t')],
'algorithms::adaboost::training': [('nClasses', 'size_t')],
}

# Some algorithms have no public constructors and need to be instantiated with 'create'
Expand Down Expand Up @@ -100,13 +101,15 @@ def wrap_algo(algo, ver):
ignore = {
'algorithms::kmeans::init': ['firstIteration', 'outputForStep5Required',], # internal for distributed
'algorithms::kmeans::init::interface1': ['nRowsTotal', 'offset', 'seed',], # internal for distributed, deprecated
'algorithms::gbt::regression::training': ['dependentVariables'], # dependentVariables from parent class is not used
'algorithms::gbt::regression::training': ['dependentVariables', 'weights'], # dependentVariables, weights from parent class is not used
'algorithms::decision_forest::training': ['seed',], # deprecated
'algorithms::decision_forest::classification::training': ['updatedEngine',], # output
'algorithms::decision_forest::regression::training': ['algorithms::regression::training::InputId', # InputId from parent class is not used
'updatedEngine',], # output
'algorithms::linear_regression::prediction': ['algorithms::linear_model::interceptFlag',], # parameter
'algorithms::linear_regression::training': ['weights',], # weights from parent class is not used
'algorithms::ridge_regression::prediction': ['algorithms::linear_model::interceptFlag',], # parameter
'algorithms::ridge_regression::training': ['weights',], # weights from parent class is not used
'algorithms::optimization_solver::sgd': ['optionalArgument', 'algorithms::optimization_solver::iterative_solver::OptionalResultId',
'pastUpdateVector', 'pastWorkValue', 'seed',], # internal stuff, deprecated
'algorithms::optimization_solver::lbfgs': ['optionalArgument', 'algorithms::optimization_solver::iterative_solver::OptionalResultId',
Expand All @@ -126,6 +129,7 @@ def wrap_algo(algo, ver):
'algorithms::kdtree_knn_classification': ['seed',], # deprecated
'algorithms::lasso_regression::training': ['optionalArgument'], # internal stuff
'algorithms::lasso_regression::prediction': ['algorithms::linear_model::interceptFlag',], # parameter
'algorithms::multi_class_classifier': ['algorithms::multi_class_classifier::getTwoClassClassifierModels',] # unsupported return type ModelPtr*
}

# List of InterFaces, classes that can be arguments to other algorithms
Expand Down
12 changes: 7 additions & 5 deletions tests/test_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,11 @@ def test_svm_batch(self):


gen_examples = [
('adaboost_batch', None, None, (2020, 1)),
('adaboost_batch', None, None, (2020, 0)),
('adagrad_mse_batch', 'adagrad_mse_batch.csv', 'minimum'),
('association_rules_batch', 'association_rules_batch.csv', 'confidence'),
('bacon_outlier_batch', 'multivariate_outlier_batch.csv', lambda r: r[1].weights),
('brownboost_batch', None, None, (2020, 1)),
('brownboost_batch', None, None, (2020, 0)),
('correlation_distance_batch', 'correlation_distance_batch.csv', lambda r: [[np.amin(r.correlationDistance)],
[np.amax(r.correlationDistance)],
[np.mean(r.correlationDistance)],
Expand Down Expand Up @@ -122,7 +122,7 @@ def test_svm_batch(self):
('linear_regression_streaming', 'linear_regression_batch.csv', lambda r: r[1].prediction),
('log_reg_binary_dense_batch', 'log_reg_binary_dense_batch.csv', lambda r: r[1].prediction),
('log_reg_dense_batch',),
('logitboost_batch', None, None, (2020, 1)),
('logitboost_batch', None, None, (2020, 0)),
('low_order_moms_dense_batch', 'low_order_moms_dense_batch.csv', lambda r: np.vstack((r.minimum,
r.maximum,
r.sum,
Expand Down Expand Up @@ -164,8 +164,8 @@ def test_svm_batch(self):
('sgd_logistic_loss_batch', 'sgd_logistic_loss_batch.csv', 'minimum'),
('sgd_mse_batch', 'sgd_mse_batch.csv', 'minimum'),
('sorting_batch',),
('stump_classification_batch', None, None, (2020, 1)),
('stump_regression_batch', None, None, (2020, 1)),
('stump_classification_batch', None, None, (2020, 0)),
('stump_regression_batch', None, None, (2020, 0)),
('svm_multiclass_batch', 'svm_multiclass_batch.csv', lambda r: r[0].prediction),
('univariate_outlier_batch', 'univariate_outlier_batch.csv', lambda r: r[1].weights),
('dbscan_batch', 'dbscan_batch.csv', 'assignments', (2019, 5)),
Expand Down Expand Up @@ -199,6 +199,8 @@ def call(self, ex):
# some algos do not support CSR matrices
if ex.__name__.startswith('sorting'):
self.skipTest("not supporting CSR")
if any (ex.__name__.startswith(x) for x in ['adaboost', 'brownboost', 'stump_classification']):
self.skipTest("not supporting CSR")
method = 'singlePassCSR' if any(x in ex.__name__ for x in ['low_order_moms', 'covariance']) else 'fastCSR'
# cannot use fastCSR for implicit als; bug in Intel(R) DAAL?
if 'implicit_als' in ex.__name__:
Expand Down

0 comments on commit 9e86424

Please sign in to comment.