From 4576b64f7e2d758122f44ad3062fbd54a98e6023 Mon Sep 17 00:00:00 2001 From: Manish Amde Date: Sat, 22 Mar 2014 20:25:34 -0700 Subject: [PATCH] documentation and for to while loop conversion --- .../spark/mllib/tree/DecisionTree.scala | 303 ++++++++++++------ 1 file changed, 204 insertions(+), 99 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index b7492038445cc..3d5eb0fcf263b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -81,6 +81,8 @@ class DecisionTree private(val strategy: Strategy) extends Serializable with Log // Each data sample is checked for validity w.r.t to each node at a given level -- i.e., // the sample is only used for the split calculation at the node if the sampled would have // still survived the filters of the parent nodes. + + // TODO: Convert for loop to while loop breakable { for (level <- 0 until maxDepth) { @@ -120,7 +122,7 @@ class DecisionTree private(val strategy: Strategy) extends Serializable with Log } /** - * Extract the decision tree node information for th given tree level and node index + * Extract the decision tree node information for the given tree level and node index */ private def extractNodeInfo( nodeSplitStats: (Split, InformationGainStats), @@ -151,6 +153,7 @@ class DecisionTree private(val strategy: Strategy) extends Serializable with Log : Unit = { // 0 corresponds to the left child node and 1 corresponds to the right child node. + // TODO: Convert to while loop for (i <- 0 to 1) { // Calculating the index of the node from the node level and the index at the current level val nodeIndex = scala.math.pow(2, level + 1).toInt - 1 + 2 * index + i @@ -264,6 +267,31 @@ object DecisionTree extends Serializable with Logging { bins: Array[Array[Bin]]) : Array[(Split, InformationGainStats)] = { + + // The high-level description for the best split optimizations are noted here. + // + // *Level-wise training* + // We perform bin calculations for all nodes at the given level to avoid making multiple + // passes over the data. Thus, for a slightly increased computation and storage cost we save + // several iterations over the data especially at higher levels of the decision tree. + // + // *Bin-wise computation* + // We use a bin-wise best split computation strategy instead of a straightforward best split + // computation strategy. Instead of analyzing each sample for contribution to the left/right + // child node impurity of every split, we first categorize each feature of a sample into a + // bin. Each bin is an interval between a low and high split. Since each splits, and thus bin, + // is ordered (read ordering for categorical variables in the findSplitsBins method), + // we exploit this structure to calculate aggregates for bins and then use these aggregates + // to calculate information gain for each split. + // + // *Aggregation over partitions* + // Instead of performing a flatMap/reduceByKey operation, we exploit the fact that we know + // the number of splits in advance. Thus, we store the aggregates (at the appropriate + // indices) in a single array for all bins and rely upon the RDD aggregate method to + // drastically reduce the communication overhead. + + // Implementation below + // Common calculations for multiple nested methods val numNodes = scala.math.pow(2, level).toInt logDebug("numNodes = " + numNodes) @@ -294,6 +322,7 @@ object DecisionTree extends Serializable with Logging { return false } + // Apply each filter and check sample validity. Return false when invalid condition found. for (filter <- parentFilters) { val features = labeledPoint.features val featureIndex = filter.split.feature @@ -316,12 +345,13 @@ object DecisionTree extends Serializable with Logging { } } + + //Return true when the sample is valid for all filters true } - // TODO: Unit test this /** - * Finds the right bin for the given feature + * Find bin for one feature */ def findBin( featureIndex: Int, @@ -332,9 +362,12 @@ object DecisionTree extends Serializable with Logging { val binForFeatures = bins(featureIndex) val feature = labeledPoint.features(featureIndex) + /** + * Binary search helper method for continuous feature + */ def binarySearchForBins(): Int = { var left = 0 - var right = binForFeatures.length-1 + var right = binForFeatures.length - 1 while (left <= right) { val mid = left + (right - left) / 2 val bin = binForFeatures(mid) @@ -353,13 +386,10 @@ object DecisionTree extends Serializable with Logging { -1 } - if (isFeatureContinuous){ - val binIndex = binarySearchForBins() - if (binIndex == -1){ - throw new UnknownError("no bin was found for continuous variable.") - } - binIndex - } else { + /** + * Sequential search helper method to find bin for categorical feature + */ + def sequentialBinSearchForCategoricalFeature() : Int = { val numCategoricalBins = strategy.categoricalFeaturesInfo(featureIndex) var binIndex = 0 while (binIndex < numCategoricalBins) { @@ -371,26 +401,40 @@ object DecisionTree extends Serializable with Logging { } binIndex += 1 } - throw new UnknownError("no bin was found for categorical variable.") - + -1 } + if (isFeatureContinuous){ + // Perform binary search for finding bin for continuous features. + val binIndex = binarySearchForBins() + if (binIndex == -1){ + throw new UnknownError("no bin was found for continuous variable.") + } + binIndex + } else { + // Perform sequential search to find bin for categorical features. + val binIndex = sequentialBinSearchForCategoricalFeature() + if (binIndex == -1){ + throw new UnknownError("no bin was found for categorical variable.") + } + binIndex + } } /** - * Finds bins for all nodes (and all features) at a given level k features, - * l nodes (level = log2(l)). - * Storage label, b11, b12, b13, .., b1k, - * b21, b22, .. , b2k, - * bl1, bl2, .. , blk - * Denotes invalid sample for tree by noting bin for feature 1 as -1 + * Finds bins for all nodes (and all features) at a given level. + * For l nodes, k features the storage is as follows: + * label, b_11, b_12, .. , b_1k, b_21, b_22, .. , b_2k, b_l1, b_l2, .. , b_lk + * where b_ij is an integer between 0 and numBins - 1. + * Invalid sample is denoted by noting bin for feature 1 as -1 */ def findBinsForLevel(labeledPoint: LabeledPoint): Array[Double] = { // calculating bin index and label per feature per node val arr = new Array[Double](1 + (numFeatures * numNodes)) arr(0) = labeledPoint.label - for (nodeIndex <- 0 until numNodes) { + var nodeIndex = 0 + while (nodeIndex < numNodes) { val parentFilters = findParentFilters(nodeIndex) // Find out whether the sample qualifies for the particular node val sampleValid = isSampleValid(parentFilters, labeledPoint) @@ -406,17 +450,15 @@ object DecisionTree extends Serializable with Logging { featureIndex += 1 } } + nodeIndex += 1 } arr } /** - * Performs a sequential aggregation over a partition for classification. - * - * for p bins, k features, l nodes (level = log2(l)) storage is of the form: - * b111_left_count,b111_right_count, .... , .. - * .. bpk1_left_count, bpk1_right_count, .... , .. - * .. bpkl_left_count, bpkl_right_count + * Performs a sequential aggregation over a partition for classification. For l nodes, + * k features, either the left count or the right count of one of the p bins is + * incremented based upon whether the feature is classified as 0 or 1. * * @param agg Array[Double] storing aggregate calculation of size * 2*numSplits*numFeatures*numNodes for classification @@ -425,32 +467,38 @@ object DecisionTree extends Serializable with Logging { * for classification */ def classificationBinSeqOp(arr: Array[Double], agg: Array[Double]) { - for (nodeIndex <- 0 until numNodes) { + // Iterating over all nodes + var nodeIndex = 0 + while (nodeIndex < numNodes) { + // Checking whether the instance was valid for this nodeIndex val validSignalIndex = 1 + numFeatures * nodeIndex val isSampleValidForNode = arr(validSignalIndex) != InvalidBinIndex if (isSampleValidForNode) { + // Actual class label val label = arr(0) - for (featureIndex <- 0 until numFeatures) { + // Iterating over all features + var featureIndex = 0 + while (featureIndex < numFeatures) { + // Finding the bin index for this feature val arrShift = 1 + numFeatures * nodeIndex - val aggShift = 2 * numBins * numFeatures * nodeIndex val arrIndex = arrShift + featureIndex + // Updating the left or right count for one bin + val aggShift = 2 * numBins * numFeatures * nodeIndex val aggIndex = aggShift + 2 * featureIndex * numBins + arr(arrIndex).toInt * 2 label match { case (0.0) => agg(aggIndex) = agg(aggIndex) + 1 case (1.0) => agg(aggIndex + 1) = agg(aggIndex + 1) + 1 } + featureIndex += 1 } } + nodeIndex += 1 } } /** - * Performs a sequential aggregation over a partition for regression. - * - * for p bins, k features, l nodes (level = log2(l)) storage is of the form: - * b111_count,b111_sum, b111_sum_squares .... , .. - * .. bpk1_count, bpk1_sum, bpk1_sum_squares, .... , .. - * .. bpkl_count, bpkl_sum, bpkl_sum_squares + * Performs a sequential aggregation over a partition for regression. For l nodes, k features, + * the count, sum, sum of squares of one of the p bins is incremented. * * @param agg Array[Double] storing aggregate calculation of size * 3*numSplits*numFeatures*numNodes for classification @@ -459,37 +507,37 @@ object DecisionTree extends Serializable with Logging { * 3*numSplits*numFeatures*numNodes for regression */ def regressionBinSeqOp(arr: Array[Double], agg: Array[Double]) { - for (nodeIndex <- 0 until numNodes) { + // Iterating over all nodes + var nodeIndex = 0 + while (nodeIndex < numNodes) { + // Checking whether the instance was valid for this nodeIndex val validSignalIndex = 1 + numFeatures * nodeIndex val isSampleValidForNode = arr(validSignalIndex) != InvalidBinIndex if (isSampleValidForNode) { + // Actual class label val label = arr(0) - for (feature <- 0 until numFeatures) { + // Iterating over all features + var featureIndex = 0 + while (featureIndex < numFeatures) { + // Finding the bin index for this feature val arrShift = 1 + numFeatures * nodeIndex + val arrIndex = arrShift + featureIndex + // updating count, sum, sum^2 for one bin val aggShift = 3 * numBins * numFeatures * nodeIndex - val arrIndex = arrShift + feature - val aggIndex = aggShift + 3 * feature * numBins + arr(arrIndex).toInt * 3 - //count, sum, sum^2 + val aggIndex = aggShift + 3 * featureIndex * numBins + arr(arrIndex).toInt * 3 agg(aggIndex) = agg(aggIndex) + 1 agg(aggIndex + 1) = agg(aggIndex + 1) + label agg(aggIndex + 2) = agg(aggIndex + 2) + label*label + // increment featureIndex + featureIndex += 1 } } + nodeIndex += 1 } } /** * Performs a sequential aggregation over a partition. - * for p bins, k features, l nodes (level = log2(l)) storage is of the form: - * b111_left_count,b111_right_count, .... , .... - * bpk1_left_count, bpk1_right_count, .... , ...., bpkl_left_count, bpkl_right_count - * @param agg Array[Double] storing aggregate calculation of size - * 2*numSplits*numFeatures*numNodes for classification and - * 3*numSplits*numFeatures*numNodes for regression - * @param arr Array[Double] of size 1+(numFeatures*numNodes) - * @return Array[Double] storing aggregate calculation of size - * 2*numSplits*numFeatures*numNodes for classification and - * 3*numSplits*numFeatures*numNodes for regression */ def binSeqOp(agg: Array[Double], arr: Array[Double]): Array[Double] = { strategy.algo match { @@ -499,6 +547,7 @@ object DecisionTree extends Serializable with Logging { agg } + // Calculating bin aggregate length for classification or regression val binAggregateLength = strategy.algo match { case Classification => 2*numBins * numFeatures * numNodes case Regression => 3*numBins * numFeatures * numNodes @@ -512,27 +561,17 @@ object DecisionTree extends Serializable with Logging { * @return Combined aggregate from agg1 and agg2 */ def binCombOp(agg1: Array[Double], agg2: Array[Double]): Array[Double] = { - strategy.algo match { - case Classification => { - val combinedAggregate = new Array[Double](binAggregateLength) - for (index <- 0 until binAggregateLength){ - combinedAggregate(index) = agg1(index) + agg2(index) - } - combinedAggregate - } - case Regression => { - val combinedAggregate = new Array[Double](binAggregateLength) - for (index <- 0 until binAggregateLength){ - combinedAggregate(index) = agg1(index) + agg2(index) - } - combinedAggregate - } + var index = 0 + val combinedAggregate = new Array[Double](binAggregateLength) + while (index < binAggregateLength){ + combinedAggregate(index) = agg1(index) + agg2(index) + index += 1 } + combinedAggregate } - logDebug("input = " + input.count) + // find feature bins for all nodes at a level val binMappedRDD = input.map(x => findBinsForLevel(x)) - logDebug("binMappedRDD.count = " + binMappedRDD.count) // calculate bin aggregates val binAggregates = { @@ -541,7 +580,7 @@ object DecisionTree extends Serializable with Logging { logDebug("binAggregates.length = " + binAggregates.length) /** - * Calculates the information gain for all splits + * Calculates the information gain for all splits based upon left/right split aggregates * @param leftNodeAgg left node aggregates * @param featureIndex feature index * @param splitIndex split index @@ -572,6 +611,7 @@ object DecisionTree extends Serializable with Logging { if (level > 0) { topImpurity } else { + // Calculating impurity for root node strategy.impurity.calculate(left0Count + right0Count, left1Count + right1Count) } } @@ -614,6 +654,7 @@ object DecisionTree extends Serializable with Logging { if (level > 0) { topImpurity } else { + // Calculating impurity for root node val count = leftCount + rightCount val sum = leftSum + rightSum val sumSquares = leftSumSquares + rightSumSquares @@ -623,11 +664,11 @@ object DecisionTree extends Serializable with Logging { if (leftCount == 0) { return new InformationGainStats(0, topImpurity, Double.MinValue, topImpurity, - rightSum/rightCount) + rightSum / rightCount) } if (rightCount == 0) { return new InformationGainStats(0, topImpurity ,topImpurity, - Double.MinValue, leftSum/leftCount) + Double.MinValue, leftSum / leftCount) } val leftImpurity = strategy.impurity.calculate(leftCount, leftSum, leftSumSquares) @@ -644,16 +685,16 @@ object DecisionTree extends Serializable with Logging { } } - val predict = (leftSum + rightSum)/(leftCount + rightCount) + val predict = (leftSum + rightSum) / (leftCount + rightCount) new InformationGainStats(gain, impurity, leftImpurity, rightImpurity, predict) } } } - /** + /** * Extracts left and right split aggregates - * @param binData Array[Double] of size 2*numFeatures*numSplits + * @param binData Array[Double] of size 2*numFeatures*numSplits * @return (leftNodeAgg, rightNodeAgg) tuple of type (Array[Double], * Array[Double]) where each array is of size(numFeature,2*(numSplits-1)) */ @@ -663,58 +704,90 @@ object DecisionTree extends Serializable with Logging { strategy.algo match { case Classification => { - + // Initializing left and right split aggregates val leftNodeAgg = Array.ofDim[Double](numFeatures, 2 * (numBins - 1)) val rightNodeAgg = Array.ofDim[Double](numFeatures, 2 * (numBins - 1)) - for (featureIndex <- 0 until numFeatures) { + // Iterating over all features + var featureIndex = 0 + while (featureIndex < numFeatures) { + // shift for this featureIndex val shift = 2*featureIndex*numBins + + // left node aggregate for the lowest split leftNodeAgg(featureIndex)(0) = binData(shift + 0) leftNodeAgg(featureIndex)(1) = binData(shift + 1) + + // right node aggregate for the highest split rightNodeAgg(featureIndex)(2 * (numBins - 2)) = binData(shift + (2 * (numBins - 1))) rightNodeAgg(featureIndex)(2 * (numBins - 2) + 1) = binData(shift + (2 * (numBins - 1)) + 1) - for (splitIndex <- 1 until numBins - 1) { - leftNodeAgg(featureIndex)(2 * splitIndex) = binData(shift + 2*splitIndex) + + + // Iterating over all splits + var splitIndex = 1 + while (splitIndex < numBins - 1) { + // calculating left node aggregate for a split as a sum of left node aggregate of a + // lower split and the left bin aggregate of a bin where the split is a high split + leftNodeAgg(featureIndex)(2 * splitIndex) = binData(shift + 2 * splitIndex) + leftNodeAgg(featureIndex)(2 * splitIndex - 2) - leftNodeAgg(featureIndex)(2 * splitIndex + 1) = binData(shift + 2*splitIndex + 1) + + leftNodeAgg(featureIndex)(2 * splitIndex + 1) = binData(shift + 2 * splitIndex + 1) + leftNodeAgg(featureIndex)(2 * splitIndex - 2 + 1) + + // calculating right node aggregate for a split as a sum of right node aggregate of a + // higher split and the right bin aggregate of a bin where the split is a low split rightNodeAgg(featureIndex)(2 * (numBins - 2 - splitIndex)) = binData(shift + (2 *(numBins - 2 - splitIndex))) + rightNodeAgg(featureIndex)(2 * (numBins - 1 - splitIndex)) rightNodeAgg(featureIndex)(2 * (numBins - 2 - splitIndex) + 1) = binData(shift + (2* (numBins - 2 - splitIndex) + 1)) + rightNodeAgg(featureIndex)(2 * (numBins - 1 - splitIndex) + 1) + + splitIndex += 1 } + featureIndex += 1 } (leftNodeAgg, rightNodeAgg) } case Regression => { - + // Initializing left and right split aggregates val leftNodeAgg = Array.ofDim[Double](numFeatures, 3 * (numBins - 1)) val rightNodeAgg = Array.ofDim[Double](numFeatures, 3 * (numBins - 1)) - for (featureIndex <- 0 until numFeatures) { + // Iterating over all features + var featureIndex = 0 + while (featureIndex < numFeatures) { + // shift for this featureIndex val shift = 3*featureIndex*numBins + // left node aggregate for the lowest split leftNodeAgg(featureIndex)(0) = binData(shift + 0) leftNodeAgg(featureIndex)(1) = binData(shift + 1) leftNodeAgg(featureIndex)(2) = binData(shift + 2) + + // right node aggregate for the highest split rightNodeAgg(featureIndex)(3 * (numBins - 2)) = binData(shift + (3 * (numBins - 1))) rightNodeAgg(featureIndex)(3 * (numBins - 2) + 1) = binData(shift + (3 * (numBins - 1)) + 1) rightNodeAgg(featureIndex)(3 * (numBins - 2) + 2) = binData(shift + (3 * (numBins - 1)) + 2) - for (splitIndex <- 1 until numBins - 1) { + + // Iterating over all splits + var splitIndex = 1 + while (splitIndex < numBins - 1) { + // calculating left node aggregate for a split as a sum of left node aggregate of a + // lower split and the left bin aggregate of a bin where the split is a high split leftNodeAgg(featureIndex)(3 * splitIndex) - = binData(shift + 3*splitIndex) + + = binData(shift + 3 * splitIndex) + leftNodeAgg(featureIndex)(3 * splitIndex - 3) leftNodeAgg(featureIndex)(3 * splitIndex + 1) - = binData(shift + 3*splitIndex + 1) + + = binData(shift + 3 * splitIndex + 1) + leftNodeAgg(featureIndex)(3 * splitIndex - 3 + 1) leftNodeAgg(featureIndex)(3 * splitIndex + 2) - = binData(shift + 3*splitIndex + 2) + + = binData(shift + 3 * splitIndex + 2) + leftNodeAgg(featureIndex)(3 * splitIndex - 3 + 2) + + // calculating right node aggregate for a split as a sum of right node aggregate of a + // higher split and the right bin aggregate of a bin where the split is a low split rightNodeAgg(featureIndex)(3 * (numBins - 2 - splitIndex)) = binData(shift + (3 * (numBins - 2 - splitIndex))) + rightNodeAgg(featureIndex)(3 * (numBins - 1 - splitIndex)) @@ -724,13 +797,19 @@ object DecisionTree extends Serializable with Logging { rightNodeAgg(featureIndex)(3 * (numBins - 2 - splitIndex) + 2) = binData(shift + (3 * (numBins - 2 - splitIndex) + 2)) + rightNodeAgg(featureIndex)(3 * (numBins - 1 - splitIndex) + 2) + + splitIndex += 1 } + featureIndex += 1 } (leftNodeAgg, rightNodeAgg) } } } + /** + * Calculates information gain for all nodes splits + */ def calculateGainsForAllNodeSplits( leftNodeAgg: Array[Array[Double]], rightNodeAgg: Array[Array[Double]], @@ -749,10 +828,10 @@ object DecisionTree extends Serializable with Logging { } /** - * Find the best split for a node given bin aggregate data + * Find the best split for a node * @param binData Array[Double] of size 2*numSplits*numFeatures * @param nodeImpurity impurity of the top node - * @return + * @return tuple of split and information gain */ def binsToBestSplit( binData: Array[Double], @@ -760,23 +839,33 @@ object DecisionTree extends Serializable with Logging { : (Split, InformationGainStats) = { logDebug("node impurity = " + nodeImpurity) + + //extract left right node aggregates val (leftNodeAgg, rightNodeAgg) = extractLeftRightNodeAggregates(binData) + + // calculate gains for all splits val gains = calculateGainsForAllNodeSplits(leftNodeAgg, rightNodeAgg, nodeImpurity) val (bestFeatureIndex,bestSplitIndex, gainStats) = { - var bestFeatureIndex = 0 - var bestSplitIndex = 0 // Initialization with infeasible values + var bestFeatureIndex = Int.MinValue + var bestSplitIndex = Int.MinValue var bestGainStats = new InformationGainStats(Double.MinValue,-1.0,-1.0,-1.0,-1) - for (featureIndex <- 0 until numFeatures) { - for (splitIndex <- 0 until numBins - 1){ + // Iterating over features + var featureIndex = 0 + while (featureIndex < numFeatures) { + // Iterating over all splits + var splitIndex = 0 + while (splitIndex < numBins - 1){ val gainStats = gains(featureIndex)(splitIndex) if(gainStats.gain > bestGainStats.gain) { bestGainStats = gainStats bestFeatureIndex = featureIndex bestSplitIndex = splitIndex } + splitIndex += 1 } + featureIndex += 1 } (bestFeatureIndex,bestSplitIndex,bestGainStats) } @@ -786,8 +875,9 @@ object DecisionTree extends Serializable with Logging { (splits(bestFeatureIndex)(bestSplitIndex),gainStats) } - // Calculate best splits for all nodes at a given level - val bestSplits = new Array[(Split, InformationGainStats)](numNodes) + /** + * get bin data for one node + */ def getBinDataForNode(node: Int): Array[Double] = { strategy.algo match { case Classification => { @@ -803,14 +893,21 @@ object DecisionTree extends Serializable with Logging { } } - for (node <- 0 until numNodes){ + // Calculate best splits for all nodes at a given level + val bestSplits = new Array[(Split, InformationGainStats)](numNodes) + // Iterating over all nodes at this level + var node = 0 + while (node < numNodes){ val nodeImpurityIndex = scala.math.pow(2, level).toInt - 1 + node val binsForNode: Array[Double] = getBinDataForNode(node) logDebug("nodeImpurityIndex = " + nodeImpurityIndex) val parentNodeImpurity = parentImpurities(nodeImpurityIndex) logDebug("node impurity = " + parentNodeImpurity) bestSplits(node) = binsToBestSplit(binsForNode, parentNodeImpurity) + node += 1 } + + //Return best splits bestSplits } @@ -865,12 +962,15 @@ object DecisionTree extends Serializable with Logging { val splits = Array.ofDim[Split](numFeatures, numBins-1) val bins = Array.ofDim[Bin](numFeatures, numBins) - //Find all splits - for (featureIndex <- 0 until numFeatures){ + // Find all splits + + // Iterating over all features + var featureIndex = 0 + while (featureIndex < numFeatures){ + // Checking whether the feature is continuous val isFeatureContinuous = strategy.categoricalFeaturesInfo.get(featureIndex).isEmpty if (isFeatureContinuous) { val featureSamples = sampledInput.map(lp => lp.features(featureIndex)).sorted - val stride: Double = numSamples.toDouble/numBins logDebug("stride = " + stride) for (index <- 0 until numBins-1) { @@ -880,15 +980,16 @@ object DecisionTree extends Serializable with Logging { } } else { val maxFeatureValue = strategy.categoricalFeaturesInfo(featureIndex) - require(maxFeatureValue < numBins, "number of categories should be less than number " + "of bins") + // For categorical variables, each bin is a category. The bins are sorted and they + // are ordered by calculating the centriod of their corresponding labels. val centriodForCategories = sampledInput.map(lp => (lp.features(featureIndex),lp.label)) .groupBy(_._1).mapValues(x => x.map(_._2).sum / x.map(_._1).length) - // Checking for missing categorical variables + // Checking for missing categorical variables and putting them last in the sorted list val fullCentriodForCategories = scala.collection.mutable.Map[Double,Double]() for (i <- 0 until maxFeatureValue) { if (centriodForCategories.contains(i)) { @@ -898,6 +999,7 @@ object DecisionTree extends Serializable with Logging { } } + //bins sorted by centriods val categoriesSortedByCentriod = fullCentriodForCategories.toList.sortBy{_._2} @@ -922,10 +1024,12 @@ object DecisionTree extends Serializable with Logging { } } } + featureIndex += 1 } // Find all bins - for (featureIndex <- 0 until numFeatures){ + featureIndex = 0 + while (featureIndex < numFeatures){ val isFeatureContinuous = strategy.categoricalFeaturesInfo.get(featureIndex).isEmpty if (isFeatureContinuous) { // bins for categorical variables are already assigned bins(featureIndex)(0) @@ -940,6 +1044,7 @@ object DecisionTree extends Serializable with Logging { = new Bin(splits(featureIndex)(numBins-2),new DummyHighSplit(featureIndex, Continuous), Continuous, Double.MinValue) } + featureIndex += 1 } (splits,bins) }