Adaboost algorithm and support of changes in DAAL (#134)

* DAAL_DEPRECATED added to ignore list
* adaboost params are added, weights are ignored for regressions without their support
* resultsToEvaluate parameter is added in Logistic Regression
* adaboost example modification
* set version of DAAL for adaboost
* check of DAAL version in Logistic Regression example
* another processing of DAAL_DEPRECATED
* nClasses parameter in KNN algorithm
* stump, brownboost, logitboost algos
uxlfoundation · Oct 16, 2019 · 9e86424 · 9e86424
1 parent 3bc0c39
commit 9e86424
Show file tree

Hide file tree

Showing 9 changed files with 38 additions and 20 deletions.
diff --git a/daal4py/sklearn/neighbors/kdtree_knn_classifier.py b/daal4py/sklearn/neighbors/kdtree_knn_classifier.py
@@ -123,6 +123,7 @@ def fit(self, X, y):
 
         # Fit the model
         train_algo = d4p.kdtree_knn_classification_training(fptype=fptype,
+                                                            nClasses=self.n_classes_,
                                                             engine=d4p.engines_mcg59(seed=self.seed_))
         train_result = train_algo.compute(X, y_)
 

diff --git a/examples/adaboost_batch.py b/examples/adaboost_batch.py
@@ -33,9 +33,10 @@
 def main(readcsv=read_csv, method='defaultDense'):
     infile = "./data/batch/adaboost_train.csv"
     testfile = "./data/batch/adaboost_test.csv"
+    nClasses = 2
 
     # Configure an adaboost training object
-    train_algo = d4p.adaboost_training()
+    train_algo = d4p.adaboost_training(nClasses=nClasses)
 
     # Read data. Let's have 20 independent, and 1 dependent variable (for each observation)
     indep_data = readcsv(infile, range(20))
@@ -44,7 +45,7 @@ def main(readcsv=read_csv, method='defaultDense'):
     train_result = train_algo.compute(indep_data, dep_data)
 
     # Now let's do some prediction
-    predict_algo = d4p.adaboost_prediction()
+    predict_algo = d4p.adaboost_prediction(nClasses=nClasses)
     # read test data (with same #features)
     pdata = readcsv(testfile, range(20))
     # now predict using the model from the training above

diff --git a/examples/kdtree_knn_classification_batch.py b/examples/kdtree_knn_classification_batch.py
@@ -38,11 +38,12 @@ def main(readcsv=read_csv, method='defaultDense'):
 
     # Read data. Let's use 5 features per observation
     nFeatures = 5
+    nClasses = 5
     train_data   = readcsv(train_file, range(nFeatures))
     train_labels = readcsv(train_file, range(nFeatures, nFeatures+1))
 
     # Create an algorithm object and call compute
-    train_algo = d4p.kdtree_knn_classification_training()
+    train_algo = d4p.kdtree_knn_classification_training(nClasses=nClasses)
     # 'weights' is optional argument, let's use equal weights
     # in this case results must be the same as without weights
     weights = np.ones((train_data.shape[0], 1))
@@ -64,6 +65,6 @@ def main(readcsv=read_csv, method='defaultDense'):
 
 if __name__ == "__main__":
     (train_result, predict_result, predict_labels) = main()    
-    print("KD-tree based kNN classification results (first 20 observations):")
+    print("KD-tree based kNN classification results:")
     print("Ground truth(observations #30-34):\n", predict_labels[30:35])
     print("Classification results(observations #30-34):\n", predict_result.prediction[30:35])
diff --git a/examples/log_reg_dense_batch.py b/examples/log_reg_dense_batch.py
@@ -51,8 +51,15 @@ def main(readcsv=read_csv, method='defaultDense'):
     predict_data = readcsv(testfile, range(nFeatures))
 
     # set parameters and compute predictions
-    predict_alg = d4p.logistic_regression_prediction(nClasses=nClasses,
-                                                     resultsToCompute="computeClassesLabels|computeClassesProbabilities|computeClassesLogProbabilities")
+    # previous version has different interface
+    from daal4py import __daal_link_version__ as dv
+    daal_version = tuple(map(int, (dv[0:4], dv[4:8])))
+    if daal_version < (2020,0):
+        predict_alg = d4p.logistic_regression_prediction(nClasses=nClasses,
+                                                         resultsToCompute="computeClassesLabels|computeClassesProbabilities|computeClassesLogProbabilities")
+    else:
+        predict_alg = d4p.logistic_regression_prediction(nClasses=nClasses,
+                                                         resultsToEvaluate="computeClassLabels|computeClassProbabilities|computeClassLogProbabilities")
     predict_result = predict_alg.compute(predict_data, train_result.model)
     # the prediction result provides prediction, probabilities and logProbabilities
     assert predict_result.probabilities.shape == (predict_data.shape[0], nClasses)

diff --git a/examples/run_examples.py b/examples/run_examples.py
@@ -41,11 +41,11 @@
 req_version = defaultdict(lambda:(2019,0))
 req_version['decision_forest_classification_batch.py'] = (2019,1)
 req_version['decision_forest_regression_batch.py'] = (2019,1)
-req_version['adaboost_batch.py'] = (2020,1)
-req_version['brownboost_batch.py'] = (2020,1)
-req_version['logitboost_batch.py'] = (2020,1)
-req_version['stump_classification_batch.py'] = (2020,1)
-req_version['stump_regression_batch.py'] = (2020,1)
+req_version['adaboost_batch.py'] = (2020,0)
+req_version['brownboost_batch.py'] = (2020,0)
+req_version['logitboost_batch.py'] = (2020,0)
+req_version['stump_classification_batch.py'] = (2020,0)
+req_version['stump_regression_batch.py'] = (2020,0)
 req_version['saga_batch.py'] = (2019,3)
 req_version['dbscan_batch.py'] = (2019,5)
 req_version['lasso_regression_batch.py'] = (2019,5)

diff --git a/generator/gen_daal4py.py b/generator/gen_daal4py.py
@@ -112,7 +112,7 @@ class cython_interface(object):
                'serializeImpl', 'deserializeImpl', 'serialImpl',
                'getEpsilonVal', 'getMinVal', 'getMaxVal', 'getPMMLNumType', 'getInternalNumType', 'getIndexNumType',
                'allocateNumericTableImpl', 'allocateImpl', 'allocate', 'initialize',
-               'setPartialResultStorage', 'addPartialResultStorage',]
+               'setPartialResultStorage', 'addPartialResultStorage']
 
     # files we ignore/skip
     ignore_files = ['daal_shared_ptr.h', 'daal.h', 'daal_win.h', 'algorithm_base_mode_batch.h',

diff --git a/generator/parse.py b/generator/parse.py
@@ -518,6 +518,8 @@ def parse_header(header, ignores):
         # first strip off end-of-line comments, unless the line contains a link
         if not re.search(r'https?://', l):
             l = l.split('//')[0]
+        # delete 'DAAL_DEPRECATED'
+        l = l.replace('DAAL_DEPRECATED ', '')
         # apply each parser, continue to next line if possible
         for p in parsers:
             if p.parse(l, ctxt):

diff --git a/generator/wrappers.py b/generator/wrappers.py
@@ -20,9 +20,8 @@
 # given a C++ namespace and a DAAL version, return if namespace/algo should be
 # wrapped in daal4py.
 def wrap_algo(algo, ver):
-    #return True if 'kmeans' in algo and not 'interface' in algo else False
     # Ignore some algos if using older DAAL
-    if ver < (2020, 1) and any(x in algo for x in ['stump', 'adaboost', 'brownboost', 'logitboost',]):
+    if ver < (2020, 0) and any(x in algo for x in ['adaboost', 'stump', 'brownboost', 'logitboost',]):
         return False
     # ignore deprecated version of stump
     if 'stump' in algo and not any(x in algo for x in ['stump::regression', 'stump::classification']):
@@ -73,6 +72,8 @@ def wrap_algo(algo, ver):
     'algorithms::optimization_solver::lbfgs': [('function', 'daal::algorithms::optimization_solver::sum_of_functions::BatchPtr')],
     'algorithms::optimization_solver::adagrad': [('function', 'daal::algorithms::optimization_solver::sum_of_functions::BatchPtr')],
     'algorithms::dbscan': [('epsilon', 'fptype'), ('minObservations', 'size_t')],
+    'algorithms::adaboost::prediction': [('nClasses', 'size_t')],
+    'algorithms::adaboost::training': [('nClasses', 'size_t')],
 }
 
 # Some algorithms have no public constructors and need to be instantiated with 'create'
@@ -100,13 +101,15 @@ def wrap_algo(algo, ver):
 ignore = {
     'algorithms::kmeans::init': ['firstIteration', 'outputForStep5Required',], # internal for distributed
     'algorithms::kmeans::init::interface1': ['nRowsTotal', 'offset', 'seed',], # internal for distributed, deprecated
-    'algorithms::gbt::regression::training': ['dependentVariables'], # dependentVariables from parent class is not used
+    'algorithms::gbt::regression::training': ['dependentVariables', 'weights'], # dependentVariables, weights from parent class are not used
     'algorithms::decision_forest::training': ['seed',], # deprecated
     'algorithms::decision_forest::classification::training': ['updatedEngine',], # output
     'algorithms::decision_forest::regression::training': ['algorithms::regression::training::InputId', # InputId from parent class is not used
                                                           'updatedEngine',], # output
     'algorithms::linear_regression::prediction': ['algorithms::linear_model::interceptFlag',], # parameter
+    'algorithms::linear_regression::training': ['weights',], # weights from parent class is not used
     'algorithms::ridge_regression::prediction': ['algorithms::linear_model::interceptFlag',], # parameter
+    'algorithms::ridge_regression::training': ['weights',], # weights from parent class is not used
     'algorithms::optimization_solver::sgd': ['optionalArgument', 'algorithms::optimization_solver::iterative_solver::OptionalResultId',
                                              'pastUpdateVector', 'pastWorkValue', 'seed',], # internal stuff, deprecated
     'algorithms::optimization_solver::lbfgs': ['optionalArgument', 'algorithms::optimization_solver::iterative_solver::OptionalResultId',
@@ -126,6 +129,7 @@ def wrap_algo(algo, ver):
     'algorithms::kdtree_knn_classification': ['seed',], # deprecated
     'algorithms::lasso_regression::training': ['optionalArgument'], # internal stuff
     'algorithms::lasso_regression::prediction': ['algorithms::linear_model::interceptFlag',], # parameter
+    'algorithms::multi_class_classifier': ['algorithms::multi_class_classifier::getTwoClassClassifierModels',] # unsupported return type ModelPtr*
 }
 
 # List of InterFaces, classes that can be arguments to other algorithms

diff --git a/tests/test_examples.py b/tests/test_examples.py
@@ -86,11 +86,11 @@ def test_svm_batch(self):
 
 
 gen_examples = [
-    ('adaboost_batch', None, None, (2020, 1)),
+    ('adaboost_batch', None, None, (2020, 0)),
     ('adagrad_mse_batch', 'adagrad_mse_batch.csv', 'minimum'),
     ('association_rules_batch', 'association_rules_batch.csv', 'confidence'),
     ('bacon_outlier_batch', 'multivariate_outlier_batch.csv', lambda r: r[1].weights),
-    ('brownboost_batch', None, None, (2020, 1)),
+    ('brownboost_batch', None, None, (2020, 0)),
     ('correlation_distance_batch', 'correlation_distance_batch.csv', lambda r: [[np.amin(r.correlationDistance)],
                                                                                 [np.amax(r.correlationDistance)],
                                                                                 [np.mean(r.correlationDistance)],
@@ -122,7 +122,7 @@ def test_svm_batch(self):
     ('linear_regression_streaming', 'linear_regression_batch.csv', lambda r: r[1].prediction),
     ('log_reg_binary_dense_batch', 'log_reg_binary_dense_batch.csv', lambda r: r[1].prediction),
     ('log_reg_dense_batch',),
-    ('logitboost_batch', None, None, (2020, 1)),
+    ('logitboost_batch', None, None, (2020, 0)),
     ('low_order_moms_dense_batch', 'low_order_moms_dense_batch.csv', lambda r: np.vstack((r.minimum,
                                                                                           r.maximum,
                                                                                           r.sum,
@@ -164,8 +164,8 @@ def test_svm_batch(self):
     ('sgd_logistic_loss_batch', 'sgd_logistic_loss_batch.csv', 'minimum'),
     ('sgd_mse_batch', 'sgd_mse_batch.csv', 'minimum'),
     ('sorting_batch',),
-    ('stump_classification_batch', None, None, (2020, 1)),
-    ('stump_regression_batch', None, None, (2020, 1)),
+    ('stump_classification_batch', None, None, (2020, 0)),
+    ('stump_regression_batch', None, None, (2020, 0)),
     ('svm_multiclass_batch', 'svm_multiclass_batch.csv', lambda r: r[0].prediction),
     ('univariate_outlier_batch', 'univariate_outlier_batch.csv', lambda r: r[1].weights),
     ('dbscan_batch', 'dbscan_batch.csv', 'assignments', (2019, 5)),
@@ -199,6 +199,8 @@ def call(self, ex):
         # some algos do not support CSR matrices
         if  ex.__name__.startswith('sorting'):
             self.skipTest("not supporting CSR")
+        if  any (ex.__name__.startswith(x) for x in ['adaboost', 'brownboost', 'stump_classification']):
+            self.skipTest("not supporting CSR")
         method = 'singlePassCSR' if any(x in ex.__name__ for x in ['low_order_moms', 'covariance']) else 'fastCSR'
         # cannot use fastCSR for implicit als; bug in Intel(R) DAAL?
         if 'implicit_als' in ex.__name__: