diff --git a/Directory.Build.targets b/Directory.Build.targets
index 1ab549e60a..5e6446add9 100644
--- a/Directory.Build.targets
+++ b/Directory.Build.targets
@@ -5,5 +5,33 @@
Text="The tools directory [$(ToolsDir)] does not exist. Please run build in the root of the repo to ensure the tools are installed before attempting to build an individual project." />
+
+
+
+ lib
+ .dll
+ .so
+ .dylib
+
+
+
+
+ $(NativeOutputPath)$(LibPrefix)%(NativeAssemblyReference.Identity)$(LibExtension)
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/Microsoft.ML.Console/Console.cs b/src/Microsoft.ML.Console/Console.cs
index 12e6254cce..152d65951a 100644
--- a/src/Microsoft.ML.Console/Console.cs
+++ b/src/Microsoft.ML.Console/Console.cs
@@ -8,4 +8,4 @@ public static class Console
{
public static int Main(string[] args) => Maml.Main(args);
}
-}
+}
\ No newline at end of file
diff --git a/src/Microsoft.ML.Console/Microsoft.ML.Console.csproj b/src/Microsoft.ML.Console/Microsoft.ML.Console.csproj
index 25c51de69f..1256bf75ba 100644
--- a/src/Microsoft.ML.Console/Microsoft.ML.Console.csproj
+++ b/src/Microsoft.ML.Console/Microsoft.ML.Console.csproj
@@ -3,17 +3,34 @@
true
CORECLR
- netcoreapp2.0
- Exe
- MML
- Microsoft.ML.Runtime.Tools.Console.Console
+ netcoreapp2.0
+ Exe
+ MML
+ Microsoft.ML.Runtime.Tools.Console.Console
+
+
+
+
+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
index 9cecb0da6b..4d7e067f66 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmBinaryTrainer.cs
@@ -128,7 +128,7 @@ public static partial class LightGbm
{
[TlcModule.EntryPoint(
Name = "Trainers.LightGbmBinaryClassifier",
- Desc = "Train an LightGBM binary class model",
+ Desc = "Train a LightGBM binary class model.",
UserName = LightGbmBinaryTrainer.Summary,
ShortName = LightGbmBinaryTrainer.ShortName)]
public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, LightGbmArguments input)
diff --git a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
index 5acd90e83d..48a208e05c 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmMulticlassTrainer.cs
@@ -180,7 +180,7 @@ public static partial class LightGbm
{
[TlcModule.EntryPoint(
Name = "Trainers.LightGbmClassifier",
- Desc = "Train an LightGBM multi class model",
+ Desc = "Train a LightGBM multi class model.",
UserName = LightGbmMulticlassTrainer.Summary,
ShortName = LightGbmMulticlassTrainer.ShortName)]
public static CommonOutputs.MulticlassClassificationOutput TrainMultiClass(IHostEnvironment env, LightGbmArguments input)
diff --git a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
index b2b82fbd24..64579b3315 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmRankingTrainer.cs
@@ -127,7 +127,11 @@ protected override void CheckAndUpdateParametersBeforeTraining(IChannel ch, Role
///
public static partial class LightGbm
{
- [TlcModule.EntryPoint(Name = "Trainers.LightGbmRanker", Desc = "Train an LightGBM ranking model", UserName = LightGbmRankingTrainer.Summary, ShortName = LightGbmRankingTrainer.ShortName)]
+ [TlcModule.EntryPoint(
+ Name = "Trainers.LightGbmRanker",
+ Desc = "Train a LightGBM ranking model.",
+ UserName = LightGbmRankingTrainer.Summary,
+ ShortName = LightGbmRankingTrainer.ShortName)]
public static CommonOutputs.RankingOutput TrainRanking(IHostEnvironment env, LightGbmArguments input)
{
Contracts.CheckValue(env, nameof(env));
diff --git a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
index 5024009f98..461110ae59 100644
--- a/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
+++ b/src/Microsoft.ML.LightGBM/LightGbmRegressionTrainer.cs
@@ -120,7 +120,11 @@ protected override void CheckAndUpdateParametersBeforeTraining(IChannel ch, Role
///
public static partial class LightGbm
{
- [TlcModule.EntryPoint(Name = "Trainers.LightGbmRegressor", Desc = LightGbmRegressorTrainer.Summary, UserName = LightGbmRegressorTrainer.UserNameValue, ShortName = LightGbmRegressorTrainer.ShortName)]
+ [TlcModule.EntryPoint(
+ Name = "Trainers.LightGbmRegressor",
+ Desc = LightGbmRegressorTrainer.Summary,
+ UserName = LightGbmRegressorTrainer.UserNameValue,
+ ShortName = LightGbmRegressorTrainer.ShortName)]
public static CommonOutputs.RegressionOutput TrainRegression(IHostEnvironment env, LightGbmArguments input)
{
Contracts.CheckValue(env, nameof(env));
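For orientation, the four entry points above (TrainBinary, TrainMultiClass, TrainRanking, TrainRegression) all take the same LightGbmArguments and differ only in their output type. The sketch below is not part of the diff; it shows roughly how the regression entry point could be driven from C#. The argument field names are mirrored from the manifest entries later in this patch, and the helper wrapper itself is hypothetical.

using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.EntryPoints;
using Microsoft.ML.Runtime.LightGBM;

public static class LightGbmEntryPointSketch
{
    // Hypothetical helper: trains a LightGBM regressor via the Trainers.LightGbmRegressor entry point.
    // The supplied IDataView is assumed to already expose "Label" and "Features" columns.
    public static IPredictorModel TrainRegressionSketch(IHostEnvironment env, IDataView trainingData)
    {
        var args = new LightGbmArguments
        {
            NumBoostRound = 100,  // "Number of iterations." (alias: iter)
            NumLeaves = 16,       // "Maximum leaves for trees." (alias: nl)
            MinDataPerLeaf = 20,  // "Minimum number of instances needed in a child." (alias: mil)
            TrainingData = trainingData
        };

        CommonOutputs.RegressionOutput output = LightGbm.TrainRegression(env, args);
        return output.PredictorModel;
    }
}

The binary, multi-class and ranking entry points follow the same shape, returning the corresponding CommonOutputs type instead.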
diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs
index 192e7f2b5a..8080deac26 100644
--- a/src/Microsoft.ML/CSharpApi.cs
+++ b/src/Microsoft.ML/CSharpApi.cs
@@ -7320,7 +7320,7 @@ public enum LightGbmArgumentsEvalMetricType
        /// <summary>
- /// Train an LightGBM binary class model
+ /// Train a LightGBM binary class model.
        /// </summary>
public sealed partial class LightGbmBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
{
@@ -7525,7 +7525,7 @@ namespace Trainers
{
        /// <summary>
- /// Train an LightGBM multi class model
+ /// Train a LightGBM multi class model.
        /// </summary>
public sealed partial class LightGbmClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
{
@@ -7730,7 +7730,7 @@ namespace Trainers
{
        /// <summary>
- /// Train an LightGBM ranking model
+ /// Train a LightGBM ranking model.
        /// </summary>
public sealed partial class LightGbmRanker : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem
{
diff --git a/src/Microsoft.ML/Trainers/LightGBM.cs b/src/Microsoft.ML/Trainers/LightGBM.cs
new file mode 100644
index 0000000000..60c7a45177
--- /dev/null
+++ b/src/Microsoft.ML/Trainers/LightGBM.cs
@@ -0,0 +1,58 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.Trainers
+{
+    /// <summary>
+    /// This API requires the Microsoft.ML.LightGBM NuGet package.
+    /// </summary>
+    /// <example>
+    /// <code>
+    /// pipeline.Add(new LightGbmBinaryClassifier() { NumLeaves = 5, NumBoostRound = 5, MinDataPerLeaf = 2 })
+    /// </code>
+    /// </example>
+ public sealed partial class LightGbmBinaryClassifier
+ {
+
+ }
+
+    /// <summary>
+    /// This API requires the Microsoft.ML.LightGBM NuGet package.
+    /// </summary>
+    /// <example>
+    /// <code>
+    /// pipeline.Add(new LightGbmClassifier() { NumLeaves = 5, NumBoostRound = 5, MinDataPerLeaf = 2 })
+    /// </code>
+    /// </example>
+ public sealed partial class LightGbmClassifier
+ {
+
+ }
+
+    /// <summary>
+    /// This API requires the Microsoft.ML.LightGBM NuGet package.
+    /// </summary>
+    /// <example>
+    /// <code>
+    /// pipeline.Add(new LightGbmRanker() { NumLeaves = 5, NumBoostRound = 5, MinDataPerLeaf = 2 })
+    /// </code>
+    /// </example>
+ public sealed partial class LightGbmRanker
+ {
+
+ }
+
+    /// <summary>
+    /// This API requires the Microsoft.ML.LightGBM NuGet package.
+    /// </summary>
+    /// <example>
+    /// <code>
+    /// pipeline.Add(new LightGbmRegressor() { NumLeaves = 5, NumBoostRound = 5, MinDataPerLeaf = 2 })
+    /// </code>
+    /// </example>
+ public sealed partial class LightGbmRegressor
+ {
+
+ }
+}
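The XML doc examples above show only the pipeline.Add(...) call. A fuller, hypothetical sketch of wiring the binary classifier into a LearningPipeline follows; the data classes, the TextLoader and TextFeaturizer steps, and the column attributes follow the general pattern of the existing LearningPipeline API and are assumptions rather than part of this change.

using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Runtime.Api;
using Microsoft.ML.Trainers;
using Microsoft.ML.Transforms;

// Hypothetical input schema for a binary sentiment task; the column mapping mimics the usual samples.
public class SentimentData
{
    [Column("0")]
    public float Label;

    [Column("1")]
    public string SentimentText;
}

// Hypothetical prediction schema; PredictedLabel is the standard output column of binary classifiers.
public class SentimentPrediction
{
    [ColumnName("PredictedLabel")]
    public bool PredictedLabel;
}

public static class LightGbmPipelineSketch
{
    public static PredictionModel<SentimentData, SentimentPrediction> Train(string dataPath)
    {
        var pipeline = new LearningPipeline();

        // Load the training data and featurize the raw text into the "Features" column.
        pipeline.Add(new TextLoader(dataPath).CreateFrom<SentimentData>());
        pipeline.Add(new TextFeaturizer("Features", "SentimentText"));

        // The new LightGBM learner, configured exactly as in the doc-comment example above.
        pipeline.Add(new LightGbmBinaryClassifier() { NumLeaves = 5, NumBoostRound = 5, MinDataPerLeaf = 2 });

        return pipeline.Train<SentimentData, SentimentPrediction>();
    }
}

The LightGbmClassifier, LightGbmRanker and LightGbmRegressor classes added above slot into the same pattern with their respective prediction types.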
diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
index ee0eb3de15..34e91f2d3b 100644
--- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
+++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
@@ -50,6 +50,10 @@ Trainers.FieldAwareFactorizationMachineBinaryClassifier Train a field-aware fact
Trainers.GeneralizedAdditiveModelBinaryClassifier Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features. Microsoft.ML.Runtime.FastTree.Gam TrainBinary Microsoft.ML.Runtime.FastTree.BinaryClassificationGamTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput
Trainers.GeneralizedAdditiveModelRegressor Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features. Microsoft.ML.Runtime.FastTree.Gam TrainRegression Microsoft.ML.Runtime.FastTree.RegressionGamTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput
Trainers.KMeansPlusPlusClusterer K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified number of clusters in order to minimize the within-cluster sum of squares. K-means++ improves upon K-means by using a better method for choosing the initial cluster centers. Microsoft.ML.Runtime.KMeans.KMeansPlusPlusTrainer TrainKMeans Microsoft.ML.Runtime.KMeans.KMeansPlusPlusTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+ClusteringOutput
+Trainers.LightGbmBinaryClassifier Train a LightGBM binary class model. Microsoft.ML.Runtime.LightGBM.LightGbm TrainBinary Microsoft.ML.Runtime.LightGBM.LightGbmArguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput
+Trainers.LightGbmClassifier Train a LightGBM multi class model. Microsoft.ML.Runtime.LightGBM.LightGbm TrainMultiClass Microsoft.ML.Runtime.LightGBM.LightGbmArguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput
+Trainers.LightGbmRanker Train a LightGBM ranking model. Microsoft.ML.Runtime.LightGBM.LightGbm TrainRanking Microsoft.ML.Runtime.LightGBM.LightGbmArguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RankingOutput
+Trainers.LightGbmRegressor LightGBM Regression Microsoft.ML.Runtime.LightGBM.LightGbm TrainRegression Microsoft.ML.Runtime.LightGBM.LightGbmArguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput
Trainers.LinearSvmBinaryClassifier Train a linear SVM. Microsoft.ML.Runtime.Learners.LinearSvm TrainLinearSvm Microsoft.ML.Runtime.Learners.LinearSvm+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput
Trainers.LogisticRegressionBinaryClassifier Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. The default values of x and y are both 1. An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.Wikipedia: L-BFGS.Wikipedia: Logistic regression.Scalable Training of L1-Regularized Log-Linear Models.Test Run - L1 and L2 Regularization for Machine Learning. Microsoft.ML.Runtime.Learners.LogisticRegression TrainBinary Microsoft.ML.Runtime.Learners.LogisticRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput
Trainers.LogisticRegressionClassifier Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. The default values of x and y are both 1. An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.Wikipedia: L-BFGS.Wikipedia: Logistic regression.Scalable Training of L1-Regularized Log-Linear Models.Test Run - L1 and L2 Regularization for Machine Learning. Microsoft.ML.Runtime.Learners.LogisticRegression TrainMultiClass Microsoft.ML.Runtime.Learners.MulticlassLogisticRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput
diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
index 47ecdd2be6..d27cd1684a 100644
--- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json
+++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json
@@ -10674,11 +10674,34 @@
]
},
{
- "Name": "Trainers.LinearSvmBinaryClassifier",
- "Desc": "Train a linear SVM.",
- "FriendlyName": "SVM (Pegasos-Linear)",
- "ShortName": "svm",
+ "Name": "Trainers.LightGbmBinaryClassifier",
+ "Desc": "Train a LightGBM binary class model.",
+ "FriendlyName": "LightGBM Binary Classifier",
+ "ShortName": "LightGBM",
"Inputs": [
+ {
+ "Name": "NumBoostRound",
+ "Type": "Int",
+ "Desc": "Number of iterations.",
+ "Aliases": [
+ "iter"
+ ],
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": 100,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 10,
+ 20,
+ 50,
+ 100,
+ 150,
+ 200
+ ]
+ }
+ },
{
"Name": "TrainingData",
"Type": "DataView",
@@ -10690,6 +10713,64 @@
"SortOrder": 1.0,
"IsNullable": false
},
+ {
+ "Name": "LearningRate",
+ "Type": "Float",
+ "Desc": "Shrinkage rate for trees, used to prevent over-fitting. Range: (0,1].",
+ "Aliases": [
+ "lr"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Float",
+ "Min": 0.025,
+ "Max": 0.4,
+ "IsLogScale": true
+ }
+ },
+ {
+ "Name": "NumLeaves",
+ "Type": "Int",
+ "Desc": "Maximum leaves for trees.",
+ "Aliases": [
+ "nl"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Long",
+ "Min": 2,
+ "Max": 128,
+ "StepSize": 4.0,
+ "IsLogScale": true
+ }
+ },
+ {
+ "Name": "MinDataPerLeaf",
+ "Type": "Int",
+ "Desc": "Minimum number of instances needed in a child.",
+ "Aliases": [
+ "mil"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 1,
+ 10,
+ 20,
+ 50
+ ]
+ }
+ },
{
"Name": "FeatureColumn",
"Type": "String",
@@ -10702,6 +10783,20 @@
"IsNullable": false,
"Default": "Features"
},
+ {
+ "Name": "Booster",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "BoosterParameterFunction"
+ },
+          "Desc": "Which booster to use. Can be gbtree, gblinear, or dart; gbtree and dart use tree-based models, while gblinear uses a linear function.",
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": {
+ "Name": "gbdt"
+ }
+ },
{
"Name": "LabelColumn",
"Type": "String",
@@ -10714,6 +10809,30 @@
"IsNullable": false,
"Default": "Label"
},
+ {
+ "Name": "WeightColumn",
+ "Type": "String",
+ "Desc": "Column to use for example weight",
+ "Aliases": [
+ "weight"
+ ],
+ "Required": false,
+ "SortOrder": 4.0,
+ "IsNullable": false,
+ "Default": "Weight"
+ },
+ {
+ "Name": "GroupIdColumn",
+ "Type": "String",
+ "Desc": "Column to use for example groupId",
+ "Aliases": [
+ "groupId"
+ ],
+ "Required": false,
+ "SortOrder": 5.0,
+ "IsNullable": false,
+ "Default": "GroupId"
+ },
{
"Name": "NormalizeFeatures",
"Type": {
@@ -10755,175 +10874,270 @@
"Default": "Auto"
},
{
- "Name": "Lambda",
- "Type": "Float",
- "Desc": "Regularizer constant",
+ "Name": "MaxBin",
+ "Type": "Int",
+          "Desc": "Maximum number of bucket bins for features.",
"Aliases": [
- "lambda"
+ "mb"
],
"Required": false,
- "SortOrder": 50.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 0.001,
- "SweepRange": {
- "RangeType": "Float",
- "Min": 1E-05,
- "Max": 0.1,
- "StepSize": 10.0,
- "IsLogScale": true
- }
+ "Default": 255
},
{
- "Name": "PerformProjection",
+ "Name": "VerboseEval",
"Type": "Bool",
- "Desc": "Perform projection to unit-ball? Typically used with batch size > 1.",
+ "Desc": "Verbose",
"Aliases": [
- "project"
+ "v"
],
"Required": false,
- "SortOrder": 50.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": false,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- false,
- true
- ]
- }
+ "Default": false
},
{
- "Name": "NumIterations",
+ "Name": "Silent",
+ "Type": "Bool",
+ "Desc": "Printing running messages.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true
+ },
+ {
+ "Name": "NThread",
"Type": "Int",
- "Desc": "Number of iterations",
+ "Desc": "Number of parallel threads used to run LightGBM.",
"Aliases": [
- "iter"
+ "nt"
],
"Required": false,
- "SortOrder": 50.0,
- "IsNullable": false,
- "Default": 1,
- "SweepRange": {
- "RangeType": "Long",
- "Min": 1,
- "Max": 100,
- "StepSize": 10.0,
- "IsLogScale": true
- }
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
},
{
- "Name": "InitWtsDiameter",
- "Type": "Float",
- "Desc": "Init weights diameter",
+ "Name": "EvalMetric",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "DefaultMetric",
+ "Rmse",
+ "Mae",
+ "Logloss",
+ "Error",
+ "Merror",
+ "Mlogloss",
+ "Auc",
+ "Ndcg",
+ "Map"
+ ]
+ },
+ "Desc": "Evaluation metrics.",
"Aliases": [
- "initwts"
+ "em"
],
"Required": false,
- "SortOrder": 140.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 0.0,
- "SweepRange": {
- "RangeType": "Float",
- "Min": 0.0,
- "Max": 1.0,
- "NumSteps": 5
- }
+ "Default": "DefaultMetric"
},
{
- "Name": "NoBias",
+ "Name": "UseSoftmax",
"Type": "Bool",
- "Desc": "No bias",
+          "Desc": "Use softmax loss for multi-class classification.",
"Required": false,
"SortOrder": 150.0,
- "IsNullable": false,
- "Default": false,
+ "IsNullable": true,
+ "Default": null,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
- false,
- true
+ true,
+ false
]
}
},
{
- "Name": "Calibrator",
- "Type": {
- "Kind": "Component",
- "ComponentKind": "CalibratorTrainer"
- },
- "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration",
+ "Name": "EarlyStoppingRound",
+ "Type": "Int",
+          "Desc": "Rounds of early stopping; 0 disables it.",
+ "Aliases": [
+ "es"
+ ],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": {
- "Name": "PlattCalibrator"
- }
+ "Default": 0
},
{
- "Name": "MaxCalibrationExamples",
- "Type": "Int",
- "Desc": "The maximum number of examples to use when training the calibrator",
+ "Name": "CustomGains",
+ "Type": "String",
+          "Desc": "Comma-separated list of gains associated with each relevance label.",
+ "Aliases": [
+ "gains"
+ ],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": 1000000
+ "Default": "0,3,7,15,31,63,127,255,511,1023,2047,4095"
},
{
- "Name": "InitialWeights",
- "Type": "String",
- "Desc": "Initial Weights and bias, comma-separated",
- "Aliases": [
- "initweights"
- ],
+ "Name": "BatchSize",
+ "Type": "Int",
+ "Desc": "Number of entries in a batch when loading data.",
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": null
+ "Default": 1048576
},
{
- "Name": "Shuffle",
+ "Name": "UseCat",
"Type": "Bool",
- "Desc": "Whether to shuffle for each training iteration",
+          "Desc": "Whether to enable categorical splits.",
"Aliases": [
- "shuf"
+ "cat"
],
"Required": false,
"SortOrder": 150.0,
- "IsNullable": false,
- "Default": true,
+ "IsNullable": true,
+ "Default": null,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
- false,
- true
+ true,
+ false
]
}
},
{
- "Name": "StreamingCacheSize",
- "Type": "Int",
- "Desc": "Size of cache when trained in Scope",
- "Aliases": [
- "cache"
- ],
+ "Name": "UseMissing",
+ "Type": "Bool",
+          "Desc": "Whether to enable automatic inference of missing values.",
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": 1000000
+ "Default": false,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ true,
+ false
+ ]
+ }
},
{
- "Name": "BatchSize",
+ "Name": "MinDataPerGroup",
"Type": "Int",
- "Desc": "Batch size",
+ "Desc": "Min number of instances per categorical group.",
"Aliases": [
- "batch"
+ "mdpg"
],
"Required": false,
- "SortOrder": 190.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 1
- }
- ],
+ "Default": 100,
+ "Range": {
+ "Inf": 0,
+ "Max": 2147483647
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 10,
+ 50,
+ 100,
+ 200
+ ]
+ }
+ },
+ {
+ "Name": "MaxCatThreshold",
+ "Type": "Int",
+ "Desc": "Max number of categorical thresholds.",
+ "Aliases": [
+ "maxcat"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 32,
+ "Range": {
+ "Inf": 0,
+ "Max": 2147483647
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 8,
+ 16,
+ 32,
+ 64
+ ]
+ }
+ },
+ {
+ "Name": "CatSmooth",
+ "Type": "Float",
+          "Desc": "Laplace smoothing term for categorical feature splits. Avoids the bias of small categories.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 10.0,
+ "Range": {
+ "Min": 0.0
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 1,
+ 10,
+ 20
+ ]
+ }
+ },
+ {
+ "Name": "CatL2",
+ "Type": "Float",
+ "Desc": "L2 Regularization for categorical split.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 10.0,
+ "Range": {
+ "Min": 0.0
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.1,
+ 0.5,
+ 1,
+ 5,
+ 10
+ ]
+ }
+ },
+ {
+ "Name": "ParallelTrainer",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "ParallelLightGBM"
+ },
+ "Desc": "Parallel LightGBM Learning Algorithm",
+ "Aliases": [
+ "parag"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": {
+ "Name": "Single"
+ }
+ }
+ ],
"Outputs": [
{
"Name": "PredictorModel",
@@ -10932,6 +11146,8 @@
}
],
"InputKind": [
+ "ITrainerInputWithGroupId",
+ "ITrainerInputWithWeight",
"ITrainerInputWithLabel",
"ITrainerInput"
],
@@ -10941,11 +11157,34 @@
]
},
{
- "Name": "Trainers.LogisticRegressionBinaryClassifier",
- "Desc": "Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. The default values of x and y are both 1. An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.Wikipedia: L-BFGS.Wikipedia: Logistic regression.Scalable Training of L1-Regularized Log-Linear Models.Test Run - L1 and L2 Regularization for Machine Learning.",
- "FriendlyName": "Logistic Regression",
- "ShortName": "lr",
+ "Name": "Trainers.LightGbmClassifier",
+ "Desc": "Train a LightGBM multi class model.",
+ "FriendlyName": "LightGBM Multi Class Classifier",
+ "ShortName": "LightGBMMC",
"Inputs": [
+ {
+ "Name": "NumBoostRound",
+ "Type": "Int",
+ "Desc": "Number of iterations.",
+ "Aliases": [
+ "iter"
+ ],
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": 100,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 10,
+ 20,
+ 50,
+ 100,
+ 150,
+ 200
+ ]
+ }
+ },
{
"Name": "TrainingData",
"Type": "DataView",
@@ -10957,6 +11196,64 @@
"SortOrder": 1.0,
"IsNullable": false
},
+ {
+ "Name": "LearningRate",
+ "Type": "Float",
+ "Desc": "Shrinkage rate for trees, used to prevent over-fitting. Range: (0,1].",
+ "Aliases": [
+ "lr"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Float",
+ "Min": 0.025,
+ "Max": 0.4,
+ "IsLogScale": true
+ }
+ },
+ {
+ "Name": "NumLeaves",
+ "Type": "Int",
+ "Desc": "Maximum leaves for trees.",
+ "Aliases": [
+ "nl"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Long",
+ "Min": 2,
+ "Max": 128,
+ "StepSize": 4.0,
+ "IsLogScale": true
+ }
+ },
+ {
+ "Name": "MinDataPerLeaf",
+ "Type": "Int",
+ "Desc": "Minimum number of instances needed in a child.",
+ "Aliases": [
+ "mil"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 1,
+ 10,
+ 20,
+ 50
+ ]
+ }
+ },
{
"Name": "FeatureColumn",
"Type": "String",
@@ -10969,6 +11266,20 @@
"IsNullable": false,
"Default": "Features"
},
+ {
+ "Name": "Booster",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "BoosterParameterFunction"
+ },
+          "Desc": "Which booster to use. Can be gbtree, gblinear, or dart; gbtree and dart use tree-based models, while gblinear uses a linear function.",
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": {
+ "Name": "gbdt"
+ }
+ },
{
"Name": "LabelColumn",
"Type": "String",
@@ -10993,6 +11304,18 @@
"IsNullable": false,
"Default": "Weight"
},
+ {
+ "Name": "GroupIdColumn",
+ "Type": "String",
+ "Desc": "Column to use for example groupId",
+ "Aliases": [
+ "groupId"
+ ],
+ "Required": false,
+ "SortOrder": 5.0,
+ "IsNullable": false,
+ "Default": "GroupId"
+ },
{
"Name": "NormalizeFeatures",
"Type": {
@@ -11034,205 +11357,268 @@
"Default": "Auto"
},
{
- "Name": "ShowTrainingStats",
- "Type": "Bool",
- "Desc": "Show statistics of training examples.",
+ "Name": "MaxBin",
+ "Type": "Int",
+          "Desc": "Maximum number of bucket bins for features.",
"Aliases": [
- "stat"
+ "mb"
],
"Required": false,
- "SortOrder": 50.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": false
+ "Default": 255
},
{
- "Name": "L2Weight",
- "Type": "Float",
- "Desc": "L2 regularization weight",
+ "Name": "VerboseEval",
+ "Type": "Bool",
+ "Desc": "Verbose",
"Aliases": [
- "l2"
+ "v"
],
"Required": false,
- "SortOrder": 50.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 1.0,
- "SweepRange": {
- "RangeType": "Float",
- "Min": 0.0,
- "Max": 1.0,
- "NumSteps": 4
- }
+ "Default": false
},
{
- "Name": "L1Weight",
- "Type": "Float",
- "Desc": "L1 regularization weight",
+ "Name": "Silent",
+ "Type": "Bool",
+ "Desc": "Printing running messages.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true
+ },
+ {
+ "Name": "NThread",
+ "Type": "Int",
+ "Desc": "Number of parallel threads used to run LightGBM.",
"Aliases": [
- "l1"
+ "nt"
],
"Required": false,
- "SortOrder": 50.0,
- "IsNullable": false,
- "Default": 1.0,
- "SweepRange": {
- "RangeType": "Float",
- "Min": 0.0,
- "Max": 1.0,
- "NumSteps": 4
- }
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
},
{
- "Name": "OptTol",
- "Type": "Float",
- "Desc": "Tolerance parameter for optimization convergence. Lower = slower, more accurate",
+ "Name": "EvalMetric",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "DefaultMetric",
+ "Rmse",
+ "Mae",
+ "Logloss",
+ "Error",
+ "Merror",
+ "Mlogloss",
+ "Auc",
+ "Ndcg",
+ "Map"
+ ]
+ },
+ "Desc": "Evaluation metrics.",
"Aliases": [
- "ot"
+ "em"
],
"Required": false,
- "SortOrder": 50.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 1E-07,
+ "Default": "DefaultMetric"
+ },
+ {
+ "Name": "UseSoftmax",
+ "Type": "Bool",
+          "Desc": "Use softmax loss for multi-class classification.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
- 0.0001,
- 1E-07
+ true,
+ false
]
}
},
{
- "Name": "MemorySize",
+ "Name": "EarlyStoppingRound",
"Type": "Int",
- "Desc": "Memory size for L-BFGS. Lower=faster, less accurate",
+          "Desc": "Rounds of early stopping; 0 disables it.",
"Aliases": [
- "m"
+ "es"
],
"Required": false,
- "SortOrder": 50.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 20,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- 5,
- 20,
- 50
- ]
- }
+ "Default": 0
},
{
- "Name": "EnforceNonNegativity",
- "Type": "Bool",
- "Desc": "Enforce non-negative weights",
+ "Name": "CustomGains",
+ "Type": "String",
+          "Desc": "Comma-separated list of gains associated with each relevance label.",
"Aliases": [
- "nn"
+ "gains"
],
"Required": false,
- "SortOrder": 90.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": false
+ "Default": "0,3,7,15,31,63,127,255,511,1023,2047,4095"
},
{
- "Name": "InitWtsDiameter",
- "Type": "Float",
- "Desc": "Init weights diameter",
- "Aliases": [
- "initwts"
- ],
+ "Name": "BatchSize",
+ "Type": "Int",
+ "Desc": "Number of entries in a batch when loading data.",
"Required": false,
- "SortOrder": 140.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 0.0,
- "SweepRange": {
- "RangeType": "Float",
- "Min": 0.0,
- "Max": 1.0,
- "NumSteps": 5
- }
+ "Default": 1048576
},
{
- "Name": "MaxIterations",
- "Type": "Int",
- "Desc": "Maximum iterations.",
+ "Name": "UseCat",
+ "Type": "Bool",
+          "Desc": "Whether to enable categorical splits.",
"Aliases": [
- "maxiter"
+ "cat"
],
"Required": false,
"SortOrder": 150.0,
- "IsNullable": false,
- "Default": 2147483647,
+ "IsNullable": true,
+ "Default": null,
"SweepRange": {
- "RangeType": "Long",
- "Min": 1,
- "Max": 2147483647
+ "RangeType": "Discrete",
+ "Values": [
+ true,
+ false
+ ]
}
},
{
- "Name": "SgdInitializationTolerance",
- "Type": "Float",
- "Desc": "Run SGD to initialize LR weights, converging to this tolerance",
- "Aliases": [
- "sgd"
- ],
+ "Name": "UseMissing",
+ "Type": "Bool",
+          "Desc": "Whether to enable automatic inference of missing values.",
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": 0.0
+ "Default": false,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ true,
+ false
+ ]
+ }
},
{
- "Name": "Quiet",
- "Type": "Bool",
- "Desc": "If set to true, produce no output during training.",
+ "Name": "MinDataPerGroup",
+ "Type": "Int",
+ "Desc": "Min number of instances per categorical group.",
"Aliases": [
- "q"
+ "mdpg"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": false
+ "Default": 100,
+ "Range": {
+ "Inf": 0,
+ "Max": 2147483647
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 10,
+ 50,
+ 100,
+ 200
+ ]
+ }
},
{
- "Name": "UseThreads",
- "Type": "Bool",
- "Desc": "Whether or not to use threads. Default is true",
+ "Name": "MaxCatThreshold",
+ "Type": "Int",
+ "Desc": "Max number of categorical thresholds.",
"Aliases": [
- "t"
+ "maxcat"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": true
+ "Default": 32,
+ "Range": {
+ "Inf": 0,
+ "Max": 2147483647
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 8,
+ 16,
+ 32,
+ 64
+ ]
+ }
},
{
- "Name": "NumThreads",
- "Type": "Int",
- "Desc": "Number of threads",
- "Aliases": [
- "nt"
- ],
+ "Name": "CatSmooth",
+ "Type": "Float",
+          "Desc": "Laplace smoothing term for categorical feature splits. Avoids the bias of small categories.",
"Required": false,
"SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
+ "IsNullable": false,
+ "Default": 10.0,
+ "Range": {
+ "Min": 0.0
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 1,
+ 10,
+ 20
+ ]
+ }
},
{
- "Name": "DenseOptimizer",
- "Type": "Bool",
- "Desc": "Force densification of the internal optimization vectors",
- "Aliases": [
- "do"
- ],
+ "Name": "CatL2",
+ "Type": "Float",
+ "Desc": "L2 Regularization for categorical split.",
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": false,
+ "Default": 10.0,
+ "Range": {
+ "Min": 0.0
+ },
"SweepRange": {
"RangeType": "Discrete",
"Values": [
- false,
- true
+ 0.1,
+ 0.5,
+ 1,
+ 5,
+ 10
]
}
+ },
+ {
+ "Name": "ParallelTrainer",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "ParallelLightGBM"
+ },
+ "Desc": "Parallel LightGBM Learning Algorithm",
+ "Aliases": [
+ "parag"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": {
+ "Name": "Single"
+ }
}
],
"Outputs": [
@@ -11243,21 +11629,45 @@
}
],
"InputKind": [
+ "ITrainerInputWithGroupId",
"ITrainerInputWithWeight",
"ITrainerInputWithLabel",
"ITrainerInput"
],
"OutputKind": [
- "IBinaryClassificationOutput",
+ "IMulticlassClassificationOutput",
"ITrainerOutput"
]
},
{
- "Name": "Trainers.LogisticRegressionClassifier",
- "Desc": "Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. The default values of x and y are both 1. An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.Wikipedia: L-BFGS.Wikipedia: Logistic regression.Scalable Training of L1-Regularized Log-Linear Models.Test Run - L1 and L2 Regularization for Machine Learning.",
- "FriendlyName": "Multi-class Logistic Regression",
- "ShortName": "mlr",
+ "Name": "Trainers.LightGbmRanker",
+ "Desc": "Train a LightGBM ranking model.",
+ "FriendlyName": "LightGBM Ranking",
+ "ShortName": "LightGBMRank",
"Inputs": [
+ {
+ "Name": "NumBoostRound",
+ "Type": "Int",
+ "Desc": "Number of iterations.",
+ "Aliases": [
+ "iter"
+ ],
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": 100,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 10,
+ 20,
+ 50,
+ 100,
+ 150,
+ 200
+ ]
+ }
+ },
{
"Name": "TrainingData",
"Type": "DataView",
@@ -11269,6 +11679,64 @@
"SortOrder": 1.0,
"IsNullable": false
},
+ {
+ "Name": "LearningRate",
+ "Type": "Float",
+ "Desc": "Shrinkage rate for trees, used to prevent over-fitting. Range: (0,1].",
+ "Aliases": [
+ "lr"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Float",
+ "Min": 0.025,
+ "Max": 0.4,
+ "IsLogScale": true
+ }
+ },
+ {
+ "Name": "NumLeaves",
+ "Type": "Int",
+ "Desc": "Maximum leaves for trees.",
+ "Aliases": [
+ "nl"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Long",
+ "Min": 2,
+ "Max": 128,
+ "StepSize": 4.0,
+ "IsLogScale": true
+ }
+ },
+ {
+ "Name": "MinDataPerLeaf",
+ "Type": "Int",
+ "Desc": "Minimum number of instances needed in a child.",
+ "Aliases": [
+ "mil"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 1,
+ 10,
+ 20,
+ 50
+ ]
+ }
+ },
{
"Name": "FeatureColumn",
"Type": "String",
@@ -11281,6 +11749,20 @@
"IsNullable": false,
"Default": "Features"
},
+ {
+ "Name": "Booster",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "BoosterParameterFunction"
+ },
+          "Desc": "Which booster to use. Can be gbtree, gblinear, or dart; gbtree and dart use tree-based models, while gblinear uses a linear function.",
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": {
+ "Name": "gbdt"
+ }
+ },
{
"Name": "LabelColumn",
"Type": "String",
@@ -11306,14 +11788,26 @@
"Default": "Weight"
},
{
- "Name": "NormalizeFeatures",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "No",
- "Warn",
- "Auto",
- "Yes"
+ "Name": "GroupIdColumn",
+ "Type": "String",
+ "Desc": "Column to use for example groupId",
+ "Aliases": [
+ "groupId"
+ ],
+ "Required": false,
+ "SortOrder": 5.0,
+ "IsNullable": false,
+ "Default": "GroupId"
+ },
+ {
+ "Name": "NormalizeFeatures",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "No",
+ "Warn",
+ "Auto",
+ "Yes"
]
},
"Desc": "Normalize option for the feature column",
@@ -11346,205 +11840,268 @@
"Default": "Auto"
},
{
- "Name": "ShowTrainingStats",
- "Type": "Bool",
- "Desc": "Show statistics of training examples.",
+ "Name": "MaxBin",
+ "Type": "Int",
+          "Desc": "Maximum number of bucket bins for features.",
"Aliases": [
- "stat"
+ "mb"
],
"Required": false,
- "SortOrder": 50.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": false
+ "Default": 255
},
{
- "Name": "L2Weight",
- "Type": "Float",
- "Desc": "L2 regularization weight",
+ "Name": "VerboseEval",
+ "Type": "Bool",
+ "Desc": "Verbose",
"Aliases": [
- "l2"
+ "v"
],
"Required": false,
- "SortOrder": 50.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 1.0,
- "SweepRange": {
- "RangeType": "Float",
- "Min": 0.0,
- "Max": 1.0,
- "NumSteps": 4
- }
+ "Default": false
},
{
- "Name": "L1Weight",
- "Type": "Float",
- "Desc": "L1 regularization weight",
- "Aliases": [
- "l1"
- ],
+ "Name": "Silent",
+ "Type": "Bool",
+ "Desc": "Printing running messages.",
"Required": false,
- "SortOrder": 50.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 1.0,
- "SweepRange": {
- "RangeType": "Float",
- "Min": 0.0,
- "Max": 1.0,
- "NumSteps": 4
- }
+ "Default": true
},
{
- "Name": "OptTol",
- "Type": "Float",
- "Desc": "Tolerance parameter for optimization convergence. Lower = slower, more accurate",
+ "Name": "NThread",
+ "Type": "Int",
+ "Desc": "Number of parallel threads used to run LightGBM.",
"Aliases": [
- "ot"
+ "nt"
],
"Required": false,
- "SortOrder": 50.0,
- "IsNullable": false,
- "Default": 1E-07,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- 0.0001,
- 1E-07
- ]
- }
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
},
{
- "Name": "MemorySize",
- "Type": "Int",
- "Desc": "Memory size for L-BFGS. Lower=faster, less accurate",
+ "Name": "EvalMetric",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "DefaultMetric",
+ "Rmse",
+ "Mae",
+ "Logloss",
+ "Error",
+ "Merror",
+ "Mlogloss",
+ "Auc",
+ "Ndcg",
+ "Map"
+ ]
+ },
+ "Desc": "Evaluation metrics.",
"Aliases": [
- "m"
+ "em"
],
"Required": false,
- "SortOrder": 50.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 20,
+ "Default": "DefaultMetric"
+ },
+ {
+ "Name": "UseSoftmax",
+ "Type": "Bool",
+          "Desc": "Use softmax loss for multi-class classification.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
- 5,
- 20,
- 50
+ true,
+ false
]
}
},
{
- "Name": "EnforceNonNegativity",
- "Type": "Bool",
- "Desc": "Enforce non-negative weights",
+ "Name": "EarlyStoppingRound",
+ "Type": "Int",
+          "Desc": "Rounds of early stopping; 0 disables it.",
"Aliases": [
- "nn"
+ "es"
],
"Required": false,
- "SortOrder": 90.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": false
+ "Default": 0
},
{
- "Name": "InitWtsDiameter",
- "Type": "Float",
- "Desc": "Init weights diameter",
+ "Name": "CustomGains",
+ "Type": "String",
+          "Desc": "Comma-separated list of gains associated with each relevance label.",
"Aliases": [
- "initwts"
+ "gains"
],
"Required": false,
- "SortOrder": 140.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 0.0,
- "SweepRange": {
- "RangeType": "Float",
- "Min": 0.0,
- "Max": 1.0,
- "NumSteps": 5
- }
+ "Default": "0,3,7,15,31,63,127,255,511,1023,2047,4095"
},
{
- "Name": "MaxIterations",
+ "Name": "BatchSize",
"Type": "Int",
- "Desc": "Maximum iterations.",
- "Aliases": [
- "maxiter"
- ],
+ "Desc": "Number of entries in a batch when loading data.",
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": 2147483647,
- "SweepRange": {
- "RangeType": "Long",
- "Min": 1,
- "Max": 2147483647
- }
+ "Default": 1048576
},
{
- "Name": "SgdInitializationTolerance",
- "Type": "Float",
- "Desc": "Run SGD to initialize LR weights, converging to this tolerance",
+ "Name": "UseCat",
+ "Type": "Bool",
+          "Desc": "Whether to enable categorical splits.",
"Aliases": [
- "sgd"
+ "cat"
],
"Required": false,
"SortOrder": 150.0,
- "IsNullable": false,
- "Default": 0.0
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ true,
+ false
+ ]
+ }
},
{
- "Name": "Quiet",
+ "Name": "UseMissing",
"Type": "Bool",
- "Desc": "If set to true, produce no output during training.",
- "Aliases": [
- "q"
- ],
+          "Desc": "Whether to enable automatic inference of missing values.",
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": false
+ "Default": false,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ true,
+ false
+ ]
+ }
},
{
- "Name": "UseThreads",
- "Type": "Bool",
- "Desc": "Whether or not to use threads. Default is true",
+ "Name": "MinDataPerGroup",
+ "Type": "Int",
+ "Desc": "Min number of instances per categorical group.",
"Aliases": [
- "t"
+ "mdpg"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": true
+ "Default": 100,
+ "Range": {
+ "Inf": 0,
+ "Max": 2147483647
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 10,
+ 50,
+ 100,
+ 200
+ ]
+ }
},
{
- "Name": "NumThreads",
+ "Name": "MaxCatThreshold",
"Type": "Int",
- "Desc": "Number of threads",
+ "Desc": "Max number of categorical thresholds.",
"Aliases": [
- "nt"
+ "maxcat"
],
"Required": false,
"SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
+ "IsNullable": false,
+ "Default": 32,
+ "Range": {
+ "Inf": 0,
+ "Max": 2147483647
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 8,
+ 16,
+ 32,
+ 64
+ ]
+ }
},
{
- "Name": "DenseOptimizer",
- "Type": "Bool",
- "Desc": "Force densification of the internal optimization vectors",
- "Aliases": [
- "do"
- ],
+ "Name": "CatSmooth",
+ "Type": "Float",
+          "Desc": "Laplace smoothing term for categorical feature splits. Avoids the bias of small categories.",
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": false,
+ "Default": 10.0,
+ "Range": {
+ "Min": 0.0
+ },
"SweepRange": {
"RangeType": "Discrete",
"Values": [
- false,
- true
+ 1,
+ 10,
+ 20
+ ]
+ }
+ },
+ {
+ "Name": "CatL2",
+ "Type": "Float",
+ "Desc": "L2 Regularization for categorical split.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 10.0,
+ "Range": {
+ "Min": 0.0
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.1,
+ 0.5,
+ 1,
+ 5,
+ 10
]
}
+ },
+ {
+ "Name": "ParallelTrainer",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "ParallelLightGBM"
+ },
+ "Desc": "Parallel LightGBM Learning Algorithm",
+ "Aliases": [
+ "parag"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": {
+ "Name": "Single"
+ }
}
],
"Outputs": [
@@ -11555,21 +12112,45 @@
}
],
"InputKind": [
+ "ITrainerInputWithGroupId",
"ITrainerInputWithWeight",
"ITrainerInputWithLabel",
"ITrainerInput"
],
"OutputKind": [
- "IMulticlassClassificationOutput",
+ "IRankingOutput",
"ITrainerOutput"
]
},
{
- "Name": "Trainers.NaiveBayesClassifier",
- "Desc": "Train a MultiClassNaiveBayesTrainer.",
- "FriendlyName": "Multiclass Naive Bayes",
- "ShortName": "MNB",
+ "Name": "Trainers.LightGbmRegressor",
+ "Desc": "LightGBM Regression",
+ "FriendlyName": "LightGBM Regressor",
+ "ShortName": "LightGBMR",
"Inputs": [
+ {
+ "Name": "NumBoostRound",
+ "Type": "Int",
+ "Desc": "Number of iterations.",
+ "Aliases": [
+ "iter"
+ ],
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": 100,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 10,
+ 20,
+ 50,
+ 100,
+ 150,
+ 200
+ ]
+ }
+ },
{
"Name": "TrainingData",
"Type": "DataView",
@@ -11581,6 +12162,64 @@
"SortOrder": 1.0,
"IsNullable": false
},
+ {
+ "Name": "LearningRate",
+ "Type": "Float",
+ "Desc": "Shrinkage rate for trees, used to prevent over-fitting. Range: (0,1].",
+ "Aliases": [
+ "lr"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Float",
+ "Min": 0.025,
+ "Max": 0.4,
+ "IsLogScale": true
+ }
+ },
+ {
+ "Name": "NumLeaves",
+ "Type": "Int",
+ "Desc": "Maximum leaves for trees.",
+ "Aliases": [
+ "nl"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Long",
+ "Min": 2,
+ "Max": 128,
+ "StepSize": 4.0,
+ "IsLogScale": true
+ }
+ },
+ {
+ "Name": "MinDataPerLeaf",
+ "Type": "Int",
+ "Desc": "Minimum number of instances needed in a child.",
+ "Aliases": [
+ "mil"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 1,
+ 10,
+ 20,
+ 50
+ ]
+ }
+ },
{
"Name": "FeatureColumn",
"Type": "String",
@@ -11593,6 +12232,20 @@
"IsNullable": false,
"Default": "Features"
},
+ {
+ "Name": "Booster",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "BoosterParameterFunction"
+ },
+          "Desc": "Which booster to use. Can be gbtree, gblinear, or dart; gbtree and dart use tree-based models, while gblinear uses a linear function.",
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": {
+ "Name": "gbdt"
+ }
+ },
{
"Name": "LabelColumn",
"Type": "String",
@@ -11605,6 +12258,30 @@
"IsNullable": false,
"Default": "Label"
},
+ {
+ "Name": "WeightColumn",
+ "Type": "String",
+ "Desc": "Column to use for example weight",
+ "Aliases": [
+ "weight"
+ ],
+ "Required": false,
+ "SortOrder": 4.0,
+ "IsNullable": false,
+ "Default": "Weight"
+ },
+ {
+ "Name": "GroupIdColumn",
+ "Type": "String",
+ "Desc": "Column to use for example groupId",
+ "Aliases": [
+ "groupId"
+ ],
+ "Required": false,
+ "SortOrder": 5.0,
+ "IsNullable": false,
+ "Default": "GroupId"
+ },
{
"Name": "NormalizeFeatures",
"Type": {
@@ -11644,330 +12321,270 @@
"SortOrder": 6.0,
"IsNullable": false,
"Default": "Auto"
- }
- ],
- "Outputs": [
- {
- "Name": "PredictorModel",
- "Type": "PredictorModel",
- "Desc": "The trained model"
- }
- ],
- "InputKind": [
- "ITrainerInputWithLabel",
- "ITrainerInput"
- ],
- "OutputKind": [
- "IMulticlassClassificationOutput",
- "ITrainerOutput"
- ]
- },
- {
- "Name": "Trainers.OnlineGradientDescentRegressor",
- "Desc": "Train a Online gradient descent perceptron.",
- "FriendlyName": "Stochastic Gradient Descent (Regression)",
- "ShortName": "ogd",
- "Inputs": [
+ },
{
- "Name": "TrainingData",
- "Type": "DataView",
- "Desc": "The data to be used for training",
+ "Name": "MaxBin",
+ "Type": "Int",
+          "Desc": "Maximum number of bucket bins for features.",
"Aliases": [
- "data"
+ "mb"
],
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 255
},
{
- "Name": "FeatureColumn",
- "Type": "String",
- "Desc": "Column to use for features",
+ "Name": "VerboseEval",
+ "Type": "Bool",
+ "Desc": "Verbose",
"Aliases": [
- "feat"
+ "v"
],
"Required": false,
- "SortOrder": 2.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": "Features"
+ "Default": false
},
{
- "Name": "LabelColumn",
- "Type": "String",
- "Desc": "Column to use for labels",
+ "Name": "Silent",
+ "Type": "Bool",
+ "Desc": "Printing running messages.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true
+ },
+ {
+ "Name": "NThread",
+ "Type": "Int",
+ "Desc": "Number of parallel threads used to run LightGBM.",
"Aliases": [
- "lab"
+ "nt"
],
"Required": false,
- "SortOrder": 3.0,
- "IsNullable": false,
- "Default": "Label"
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
},
{
- "Name": "NormalizeFeatures",
+ "Name": "EvalMetric",
"Type": {
"Kind": "Enum",
"Values": [
- "No",
- "Warn",
- "Auto",
- "Yes"
+ "DefaultMetric",
+ "Rmse",
+ "Mae",
+ "Logloss",
+ "Error",
+ "Merror",
+ "Mlogloss",
+ "Auc",
+ "Ndcg",
+ "Map"
]
},
- "Desc": "Normalize option for the feature column",
+ "Desc": "Evaluation metrics.",
"Aliases": [
- "norm"
+ "em"
],
"Required": false,
- "SortOrder": 5.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": "Auto"
+ "Default": "DefaultMetric"
},
{
- "Name": "Caching",
- "Type": {
- "Kind": "Enum",
+ "Name": "UseSoftmax",
+ "Type": "Bool",
+          "Desc": "Use softmax loss for multi-class classification.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Discrete",
"Values": [
- "Auto",
- "Memory",
- "Disk",
- "None"
+ true,
+ false
]
- },
- "Desc": "Whether learner should cache input training data",
+ }
+ },
+ {
+ "Name": "EarlyStoppingRound",
+ "Type": "Int",
+ "Desc": "Rounds of early stopping, 0 will disable it.",
"Aliases": [
- "cache"
+ "es"
],
"Required": false,
- "SortOrder": 6.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": "Auto"
+ "Default": 0
},
{
- "Name": "LossFunction",
- "Type": {
- "Kind": "Component",
- "ComponentKind": "RegressionLossFunction"
- },
- "Desc": "Loss Function",
+ "Name": "CustomGains",
+ "Type": "String",
+ "Desc": "Comma seperated list of gains associated to each relevance label.",
"Aliases": [
- "loss"
+ "gains"
],
"Required": false,
- "SortOrder": 50.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": {
- "Name": "SquaredLoss"
- }
+ "Default": "0,3,7,15,31,63,127,255,511,1023,2047,4095"
},
{
- "Name": "LearningRate",
- "Type": "Float",
- "Desc": "Learning rate",
+ "Name": "BatchSize",
+ "Type": "Int",
+ "Desc": "Number of entries in a batch when loading data.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1048576
+ },
+ {
+ "Name": "UseCat",
+ "Type": "Bool",
+ "Desc": "Enable categorical split or not.",
"Aliases": [
- "lr"
+ "cat"
],
"Required": false,
- "SortOrder": 50.0,
- "IsNullable": false,
- "Default": 0.1,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
- 0.01,
- 0.1,
- 0.5,
- 1.0
+ true,
+ false
]
}
},
{
- "Name": "DecreaseLearningRate",
+ "Name": "UseMissing",
"Type": "Bool",
- "Desc": "Decrease learning rate",
- "Aliases": [
- "decreaselr"
- ],
+ "Desc": "Enable missing value auto infer or not.",
"Required": false,
- "SortOrder": 50.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": true,
+ "Default": false,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
- false,
- true
+ true,
+ false
]
}
},
{
- "Name": "L2RegularizerWeight",
- "Type": "Float",
- "Desc": "L2 Regularization Weight",
+ "Name": "MinDataPerGroup",
+ "Type": "Int",
+ "Desc": "Min number of instances per categorical group.",
"Aliases": [
- "reg"
+ "mdpg"
],
"Required": false,
- "SortOrder": 50.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 0.0,
+ "Default": 100,
+ "Range": {
+ "Inf": 0,
+ "Max": 2147483647
+ },
"SweepRange": {
- "RangeType": "Float",
- "Min": 0.0,
- "Max": 0.5
+ "RangeType": "Discrete",
+ "Values": [
+ 10,
+ 50,
+ 100,
+ 200
+ ]
}
},
{
- "Name": "NumIterations",
+ "Name": "MaxCatThreshold",
"Type": "Int",
- "Desc": "Number of iterations",
+ "Desc": "Max number of categorical thresholds.",
"Aliases": [
- "iter"
+ "maxcat"
],
"Required": false,
- "SortOrder": 50.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 1,
+ "Default": 32,
+ "Range": {
+ "Inf": 0,
+ "Max": 2147483647
+ },
"SweepRange": {
- "RangeType": "Long",
- "Min": 1,
- "Max": 100,
- "StepSize": 10.0,
- "IsLogScale": true
+ "RangeType": "Discrete",
+ "Values": [
+ 8,
+ 16,
+ 32,
+ 64
+ ]
}
},
{
- "Name": "InitWtsDiameter",
+ "Name": "CatSmooth",
"Type": "Float",
- "Desc": "Init weights diameter",
- "Aliases": [
- "initwts"
- ],
+ "Desc": "Lapalace smooth term in categorical feature spilt. Avoid the bias of small categories.",
"Required": false,
- "SortOrder": 140.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 0.0,
+ "Default": 10.0,
+ "Range": {
+ "Min": 0.0
+ },
"SweepRange": {
- "RangeType": "Float",
- "Min": 0.0,
- "Max": 1.0,
- "NumSteps": 5
+ "RangeType": "Discrete",
+ "Values": [
+ 1,
+ 10,
+ 20
+ ]
}
},
{
- "Name": "ResetWeightsAfterXExamples",
- "Type": "Int",
- "Desc": "Number of examples after which weights will be reset to the current average",
- "Aliases": [
- "numreset"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "DoLazyUpdates",
- "Type": "Bool",
- "Desc": "Instead of updating averaged weights on every example, only update when loss is nonzero",
- "Aliases": [
- "lazy"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": true
- },
- {
- "Name": "RecencyGain",
- "Type": "Float",
- "Desc": "Extra weight given to more recent updates",
- "Aliases": [
- "rg"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 0.0
- },
- {
- "Name": "RecencyGainMulti",
- "Type": "Bool",
- "Desc": "Whether Recency Gain is multiplicative (vs. additive)",
- "Aliases": [
- "rgm"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": false
- },
- {
- "Name": "Averaged",
- "Type": "Bool",
- "Desc": "Do averaging?",
- "Aliases": [
- "avg"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": true
- },
- {
- "Name": "AveragedTolerance",
+ "Name": "CatL2",
"Type": "Float",
- "Desc": "The inexactness tolerance for averaging",
- "Aliases": [
- "avgtol"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 0.01
- },
- {
- "Name": "InitialWeights",
- "Type": "String",
- "Desc": "Initial Weights and bias, comma-separated",
- "Aliases": [
- "initweights"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Shuffle",
- "Type": "Bool",
- "Desc": "Whether to shuffle for each training iteration",
- "Aliases": [
- "shuf"
- ],
+ "Desc": "L2 Regularization for categorical split.",
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": true,
+ "Default": 10.0,
+ "Range": {
+ "Min": 0.0
+ },
"SweepRange": {
"RangeType": "Discrete",
"Values": [
- false,
- true
+ 0.1,
+ 0.5,
+ 1,
+ 5,
+ 10
]
}
},
{
- "Name": "StreamingCacheSize",
- "Type": "Int",
- "Desc": "Size of cache when trained in Scope",
+ "Name": "ParallelTrainer",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "ParallelLightGBM"
+ },
+ "Desc": "Parallel LightGBM Learning Algorithm",
"Aliases": [
- "cache"
+ "parag"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": 1000000
+ "Default": {
+ "Name": "Single"
+ }
}
],
"Outputs": [
@@ -11978,6 +12595,8 @@
}
],
"InputKind": [
+ "ITrainerInputWithGroupId",
+ "ITrainerInputWithWeight",
"ITrainerInputWithLabel",
"ITrainerInput"
],
@@ -11987,10 +12606,10 @@
]
},
{
- "Name": "Trainers.PcaAnomalyDetector",
- "Desc": "Train an PCA Anomaly model.",
- "FriendlyName": "PCA Anomaly Detector",
- "ShortName": "pcaAnom",
+ "Name": "Trainers.LinearSvmBinaryClassifier",
+ "Desc": "Train a linear SVM.",
+ "FriendlyName": "SVM (Pegasos-Linear)",
+ "ShortName": "svm",
"Inputs": [
{
"Name": "TrainingData",
@@ -12016,16 +12635,16 @@
"Default": "Features"
},
{
- "Name": "WeightColumn",
+ "Name": "LabelColumn",
"Type": "String",
- "Desc": "Column to use for example weight",
+ "Desc": "Column to use for labels",
"Aliases": [
- "weight"
+ "lab"
],
"Required": false,
- "SortOrder": 4.0,
+ "SortOrder": 3.0,
"IsNullable": false,
- "Default": "Weight"
+ "Default": "Label"
},
{
"Name": "NormalizeFeatures",
@@ -12068,54 +12687,35 @@
"Default": "Auto"
},
{
- "Name": "Rank",
- "Type": "Int",
- "Desc": "The number of components in the PCA",
+ "Name": "Lambda",
+ "Type": "Float",
+ "Desc": "Regularizer constant",
"Aliases": [
- "k"
+ "lambda"
],
"Required": false,
"SortOrder": 50.0,
"IsNullable": false,
- "Default": 20,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- 10,
- 20,
- 40,
- 80
- ]
- }
- },
- {
- "Name": "Oversampling",
- "Type": "Int",
- "Desc": "Oversampling parameter for randomized PCA training",
- "Required": false,
- "SortOrder": 50.0,
- "IsNullable": false,
- "Default": 20,
+ "Default": 0.001,
"SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- 10,
- 20,
- 40
- ]
+ "RangeType": "Float",
+ "Min": 1E-05,
+ "Max": 0.1,
+ "StepSize": 10.0,
+ "IsLogScale": true
}
},
{
- "Name": "Center",
+ "Name": "PerformProjection",
"Type": "Bool",
- "Desc": "If enabled, data is centered to be zero mean",
+ "Desc": "Perform projection to unit-ball? Typically used with batch size > 1.",
"Aliases": [
- "center"
+ "project"
],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 50.0,
"IsNullable": false,
- "Default": true,
+ "Default": false,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
@@ -12125,17 +12725,136 @@
}
},
{
- "Name": "Seed",
+ "Name": "NumIterations",
"Type": "Int",
- "Desc": "The seed for random number generation",
+ "Desc": "Number of iterations",
"Aliases": [
- "seed"
+ "iter"
],
"Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- }
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": 1,
+ "SweepRange": {
+ "RangeType": "Long",
+ "Min": 1,
+ "Max": 100,
+ "StepSize": 10.0,
+ "IsLogScale": true
+ }
+ },
+ {
+ "Name": "InitWtsDiameter",
+ "Type": "Float",
+ "Desc": "Init weights diameter",
+ "Aliases": [
+ "initwts"
+ ],
+ "Required": false,
+ "SortOrder": 140.0,
+ "IsNullable": false,
+ "Default": 0.0,
+ "SweepRange": {
+ "RangeType": "Float",
+ "Min": 0.0,
+ "Max": 1.0,
+ "NumSteps": 5
+ }
+ },
+ {
+ "Name": "NoBias",
+ "Type": "Bool",
+ "Desc": "No bias",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ false,
+ true
+ ]
+ }
+ },
+ {
+ "Name": "Calibrator",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "CalibratorTrainer"
+ },
+ "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": {
+ "Name": "PlattCalibrator"
+ }
+ },
+ {
+ "Name": "MaxCalibrationExamples",
+ "Type": "Int",
+ "Desc": "The maximum number of examples to use when training the calibrator",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1000000
+ },
+ {
+ "Name": "InitialWeights",
+ "Type": "String",
+ "Desc": "Initial Weights and bias, comma-separated",
+ "Aliases": [
+ "initweights"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Shuffle",
+ "Type": "Bool",
+ "Desc": "Whether to shuffle for each training iteration",
+ "Aliases": [
+ "shuf"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ false,
+ true
+ ]
+ }
+ },
+ {
+ "Name": "StreamingCacheSize",
+ "Type": "Int",
+ "Desc": "Size of cache when trained in Scope",
+ "Aliases": [
+ "cache"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1000000
+ },
+ {
+ "Name": "BatchSize",
+ "Type": "Int",
+ "Desc": "Batch size",
+ "Aliases": [
+ "batch"
+ ],
+ "Required": false,
+ "SortOrder": 190.0,
+ "IsNullable": false,
+ "Default": 1
+ }
],
"Outputs": [
{
@@ -12145,19 +12864,19 @@
}
],
"InputKind": [
- "IUnsupervisedTrainerWithWeight",
+ "ITrainerInputWithLabel",
"ITrainerInput"
],
"OutputKind": [
- "IAnomalyDetectionOutput",
+ "IBinaryClassificationOutput",
"ITrainerOutput"
]
},
{
- "Name": "Trainers.PoissonRegressor",
- "Desc": "Train an Poisson regression model.",
- "FriendlyName": "Poisson Regression",
- "ShortName": "PR",
+ "Name": "Trainers.LogisticRegressionBinaryClassifier",
+ "Desc": "Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. The default values of x and y are both 1. An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.Wikipedia: L-BFGS.Wikipedia: Logistic regression.Scalable Training of L1-Regularized Log-Linear Models.Test Run - L1 and L2 Regularization for Machine Learning.",
+ "FriendlyName": "Logistic Regression",
+ "ShortName": "lr",
"Inputs": [
{
"Name": "TrainingData",
@@ -12246,6 +12965,18 @@
"IsNullable": false,
"Default": "Auto"
},
+ {
+ "Name": "ShowTrainingStats",
+ "Type": "Bool",
+ "Desc": "Show statistics of training examples.",
+ "Aliases": [
+ "stat"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": false
+ },
{
"Name": "L2Weight",
"Type": "Float",
@@ -12449,40 +13180,16 @@
"ITrainerInput"
],
"OutputKind": [
- "IRegressionOutput",
+ "IBinaryClassificationOutput",
"ITrainerOutput"
]
},
{
- "Name": "Trainers.StochasticDualCoordinateAscentBinaryClassifier",
- "Desc": "Train an SDCA binary model.",
- "FriendlyName": "Fast Linear (SA-SDCA)",
- "ShortName": "SDCA",
+ "Name": "Trainers.LogisticRegressionClassifier",
+ "Desc": "Logistic Regression is a classification method used to predict the value of a categorical dependent variable from its relationship to one or more independent variables assumed to have a logistic distribution. If the dependent variable has only two possible values (success/failure), then the logistic regression is binary. If the dependent variable has more than two possible values (blood type given diagnostic test results), then the logistic regression is multinomial.The optimization technique used for LogisticRegressionBinaryClassifier is the limited memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS). Both the L-BFGS and regular BFGS algorithms use quasi-Newtonian methods to estimate the computationally intensive Hessian matrix in the equation used by Newton's method to calculate steps. But the L-BFGS approximation uses only a limited amount of memory to compute the next step direction, so that it is especially suited for problems with a large number of variables. The memory_size parameter specifies the number of past positions and gradients to store for use in the computation of the next step.This learner can use elastic net regularization: a linear combination of L1 (lasso) and L2 (ridge) regularizations. Regularization is a method that can render an ill-posed problem more tractable by imposing constraints that provide information to supplement the data and that prevents overfitting by penalizing models with extreme coefficient values. This can improve the generalization of the model learned by selecting the optimal complexity in the bias-variance tradeoff. Regularization works by adding the penalty that is associated with coefficient values to the error of the hypothesis. An accurate model with extreme coefficient values would be penalized more, but a less accurate model with more conservative values would be penalized less. L1 and L2 regularization have different effects and uses that are complementary in certain respects.l1_weight: can be applied to sparse models, when working with high-dimensional data. It pulls small weights associated features that are relatively unimportant towards 0. l2_weight: is preferable for data that is not sparse. It pulls large weights towards zero. Adding the ridge penalty to the regularization overcomes some of lasso's limitations. It can improve its predictive accuracy, for example, when the number of predictors is greater than the sample size. If x = l1_weight and y = l2_weight, ax + by = c defines the linear span of the regularization terms. The default values of x and y are both 1. An agressive regularization can harm predictive capacity by excluding important variables out of the model. So choosing the optimal values for the regularization parameters is important for the performance of the logistic regression model.Wikipedia: L-BFGS.Wikipedia: Logistic regression.Scalable Training of L1-Regularized Log-Linear Models.Test Run - L1 and L2 Regularization for Machine Learning.",
+ "FriendlyName": "Multi-class Logistic Regression",
+ "ShortName": "mlr",
"Inputs": [
- {
- "Name": "L2Const",
- "Type": "Float",
- "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.",
- "Aliases": [
- "l2"
- ],
- "Required": false,
- "SortOrder": 1.0,
- "IsNullable": true,
- "Default": null,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- "",
- 1E-07,
- 1E-06,
- 1E-05,
- 0.0001,
- 0.001,
- 0.01
- ]
- }
- },
{
"Name": "TrainingData",
"Type": "DataView",
@@ -12494,29 +13201,6 @@
"SortOrder": 1.0,
"IsNullable": false
},
- {
- "Name": "L1Threshold",
- "Type": "Float",
- "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set.",
- "Aliases": [
- "l1"
- ],
- "Required": false,
- "SortOrder": 2.0,
- "IsNullable": true,
- "Default": null,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- "",
- 0.0,
- 0.25,
- 0.5,
- 0.75,
- 1.0
- ]
- }
- },
{
"Name": "FeatureColumn",
"Type": "String",
@@ -12541,6 +13225,18 @@
"IsNullable": false,
"Default": "Label"
},
+ {
+ "Name": "WeightColumn",
+ "Type": "String",
+ "Desc": "Column to use for example weight",
+ "Aliases": [
+ "weight"
+ ],
+ "Required": false,
+ "SortOrder": 4.0,
+ "IsNullable": false,
+ "Default": "Weight"
+ },
{
"Name": "NormalizeFeatures",
"Type": {
@@ -12582,138 +13278,181 @@
"Default": "Auto"
},
{
- "Name": "LossFunction",
- "Type": {
- "Kind": "Component",
- "ComponentKind": "SDCAClassificationLossFunction"
- },
- "Desc": "Loss Function",
+ "Name": "ShowTrainingStats",
+ "Type": "Bool",
+ "Desc": "Show statistics of training examples.",
"Aliases": [
- "loss"
+ "stat"
],
"Required": false,
"SortOrder": 50.0,
"IsNullable": false,
- "Default": {
- "Name": "LogLoss"
- }
+ "Default": false
},
{
- "Name": "NumThreads",
- "Type": "Int",
- "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.",
+ "Name": "L2Weight",
+ "Type": "Float",
+ "Desc": "L2 regularization weight",
"Aliases": [
- "nt",
- "t",
- "threads"
+ "l2"
],
"Required": false,
"SortOrder": 50.0,
- "IsNullable": true,
- "Default": null
+ "IsNullable": false,
+ "Default": 1.0,
+ "SweepRange": {
+ "RangeType": "Float",
+ "Min": 0.0,
+ "Max": 1.0,
+ "NumSteps": 4
+ }
},
{
- "Name": "PositiveInstanceWeight",
+ "Name": "L1Weight",
"Type": "Float",
- "Desc": "Apply weight to the positive class, for imbalanced data",
+ "Desc": "L1 regularization weight",
"Aliases": [
- "piw"
+ "l1"
],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 50.0,
"IsNullable": false,
- "Default": 1.0
+ "Default": 1.0,
+ "SweepRange": {
+ "RangeType": "Float",
+ "Min": 0.0,
+ "Max": 1.0,
+ "NumSteps": 4
+ }
},
{
- "Name": "Calibrator",
- "Type": {
- "Kind": "Component",
- "ComponentKind": "CalibratorTrainer"
- },
- "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration",
+ "Name": "OptTol",
+ "Type": "Float",
+ "Desc": "Tolerance parameter for optimization convergence. Lower = slower, more accurate",
+ "Aliases": [
+ "ot"
+ ],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 50.0,
"IsNullable": false,
- "Default": {
- "Name": "PlattCalibrator"
+ "Default": 1E-07,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.0001,
+ 1E-07
+ ]
}
},
{
- "Name": "MaxCalibrationExamples",
+ "Name": "MemorySize",
"Type": "Int",
- "Desc": "The maximum number of examples to use when training the calibrator",
+ "Desc": "Memory size for L-BFGS. Lower=faster, less accurate",
+ "Aliases": [
+ "m"
+ ],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 50.0,
"IsNullable": false,
- "Default": 1000000
+ "Default": 20,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 5,
+ 20,
+ 50
+ ]
+ }
},
{
- "Name": "ConvergenceTolerance",
+ "Name": "EnforceNonNegativity",
+ "Type": "Bool",
+ "Desc": "Enforce non-negative weights",
+ "Aliases": [
+ "nn"
+ ],
+ "Required": false,
+ "SortOrder": 90.0,
+ "IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "InitWtsDiameter",
"Type": "Float",
- "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.",
+ "Desc": "Init weights diameter",
"Aliases": [
- "tol"
+ "initwts"
],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 140.0,
"IsNullable": false,
- "Default": 0.1,
+ "Default": 0.0,
"SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- 0.001,
- 0.01,
- 0.1,
- 0.2
- ]
+ "RangeType": "Float",
+ "Min": 0.0,
+ "Max": 1.0,
+ "NumSteps": 5
}
},
{
"Name": "MaxIterations",
"Type": "Int",
- "Desc": "Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic.",
+ "Desc": "Maximum iterations.",
"Aliases": [
- "iter"
+ "maxiter"
],
"Required": false,
"SortOrder": 150.0,
- "IsNullable": true,
- "Default": null,
+ "IsNullable": false,
+ "Default": 2147483647,
"SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- "",
- 10,
- 20,
- 100
- ]
+ "RangeType": "Long",
+ "Min": 1,
+ "Max": 2147483647
}
},
{
- "Name": "Shuffle",
+ "Name": "SgdInitializationTolerance",
+ "Type": "Float",
+ "Desc": "Run SGD to initialize LR weights, converging to this tolerance",
+ "Aliases": [
+ "sgd"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.0
+ },
+ {
+ "Name": "Quiet",
"Type": "Bool",
- "Desc": "Shuffle data every epoch?",
+ "Desc": "If set to true, produce no output during training.",
"Aliases": [
- "shuf"
+ "q"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": true,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- false,
- true
- ]
- }
+ "Default": false
},
{
- "Name": "CheckFrequency",
+ "Name": "UseThreads",
+ "Type": "Bool",
+ "Desc": "Whether or not to use threads. Default is true",
+ "Aliases": [
+ "t"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true
+ },
+ {
+ "Name": "NumThreads",
"Type": "Int",
- "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations.",
+ "Desc": "Number of threads",
"Aliases": [
- "checkFreq"
+ "nt"
],
"Required": false,
"SortOrder": 150.0,
@@ -12721,23 +13460,21 @@
"Default": null
},
{
- "Name": "BiasLearningRate",
- "Type": "Float",
- "Desc": "The learning rate for adjusting bias from being regularized.",
+ "Name": "DenseOptimizer",
+ "Type": "Bool",
+ "Desc": "Force densification of the internal optimization vectors",
"Aliases": [
- "blr"
+ "do"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": 0.0,
+ "Default": false,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
- 0.0,
- 0.01,
- 0.1,
- 1.0
+ false,
+ true
]
}
}
@@ -12750,44 +13487,21 @@
}
],
"InputKind": [
+ "ITrainerInputWithWeight",
"ITrainerInputWithLabel",
"ITrainerInput"
],
"OutputKind": [
- "IBinaryClassificationOutput",
+ "IMulticlassClassificationOutput",
"ITrainerOutput"
]
},
{
- "Name": "Trainers.StochasticDualCoordinateAscentClassifier",
- "Desc": "This classifier is a trainer based on the Stochastic DualCoordinate Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation that supports multi-threading.Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.Several choices of loss functions are also provided.The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.For more information on SDCA, see:Scaling Up Stochastic Dual Coordinate Ascent.Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization.Note that SDCA is a stochastic and streaming optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to`False` and `NumThreads` to `1`.Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. In general, the larger the l2_weight, the faster SDCA converges.",
- "FriendlyName": "Fast Linear Multi-class Classification (SA-SDCA)",
- "ShortName": "sasdcamc",
+ "Name": "Trainers.NaiveBayesClassifier",
+ "Desc": "Train a MultiClassNaiveBayesTrainer.",
+ "FriendlyName": "Multiclass Naive Bayes",
+ "ShortName": "MNB",
"Inputs": [
- {
- "Name": "L2Const",
- "Type": "Float",
- "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.",
- "Aliases": [
- "l2"
- ],
- "Required": false,
- "SortOrder": 1.0,
- "IsNullable": true,
- "Default": null,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- "",
- 1E-07,
- 1E-06,
- 1E-05,
- 0.0001,
- 0.001,
- 0.01
- ]
- }
- },
{
"Name": "TrainingData",
"Type": "DataView",
@@ -12799,29 +13513,6 @@
"SortOrder": 1.0,
"IsNullable": false
},
- {
- "Name": "L1Threshold",
- "Type": "Float",
- "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set.",
- "Aliases": [
- "l1"
- ],
- "Required": false,
- "SortOrder": 2.0,
- "IsNullable": true,
- "Default": null,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- "",
- 0.0,
- 0.25,
- 0.5,
- 0.75,
- 1.0
- ]
- }
- },
{
"Name": "FeatureColumn",
"Type": "String",
@@ -12885,134 +13576,9 @@
"SortOrder": 6.0,
"IsNullable": false,
"Default": "Auto"
- },
- {
- "Name": "LossFunction",
- "Type": {
- "Kind": "Component",
- "ComponentKind": "SDCAClassificationLossFunction"
- },
- "Desc": "Loss Function",
- "Aliases": [
- "loss"
- ],
- "Required": false,
- "SortOrder": 50.0,
- "IsNullable": false,
- "Default": {
- "Name": "LogLoss"
- }
- },
- {
- "Name": "NumThreads",
- "Type": "Int",
- "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.",
- "Aliases": [
- "nt",
- "t",
- "threads"
- ],
- "Required": false,
- "SortOrder": 50.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "ConvergenceTolerance",
- "Type": "Float",
- "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.",
- "Aliases": [
- "tol"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 0.1,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- 0.001,
- 0.01,
- 0.1,
- 0.2
- ]
- }
- },
- {
- "Name": "MaxIterations",
- "Type": "Int",
- "Desc": "Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic.",
- "Aliases": [
- "iter"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- "",
- 10,
- 20,
- 100
- ]
- }
- },
- {
- "Name": "Shuffle",
- "Type": "Bool",
- "Desc": "Shuffle data every epoch?",
- "Aliases": [
- "shuf"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": true,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- false,
- true
- ]
- }
- },
- {
- "Name": "CheckFrequency",
- "Type": "Int",
- "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations.",
- "Aliases": [
- "checkFreq"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "BiasLearningRate",
- "Type": "Float",
- "Desc": "The learning rate for adjusting bias from being regularized.",
- "Aliases": [
- "blr"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 0.0,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- 0.0,
- 0.01,
- 0.1,
- 1.0
- ]
- }
- }
- ],
- "Outputs": [
+ }
+ ],
+ "Outputs": [
{
"Name": "PredictorModel",
"Type": "PredictorModel",
@@ -13029,35 +13595,11 @@
]
},
{
- "Name": "Trainers.StochasticDualCoordinateAscentRegressor",
- "Desc": "This classifier is a trainer based on the Stochastic DualCoordinate Ascent(SDCA) method, a state-of-the-art optimization technique for convex objective functions.The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronized implementation that supports multi-threading.Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread.Several choices of loss functions are also provided.The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms.For more information on SDCA, see:Scaling Up Stochastic Dual Coordinate Ascent.Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization.Note that SDCA is a stochastic and streaming optimization algorithm. The results depends on the order of the training data. For reproducible results, it is recommended that one sets `shuffle` to`False` and `NumThreads` to `1`.Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. In general, the larger the l2_weight, the faster SDCA converges.",
- "FriendlyName": "Fast Linear Regression (SA-SDCA)",
- "ShortName": "sasdcar",
+ "Name": "Trainers.OnlineGradientDescentRegressor",
+ "Desc": "Train a Online gradient descent perceptron.",
+ "FriendlyName": "Stochastic Gradient Descent (Regression)",
+ "ShortName": "ogd",
"Inputs": [
- {
- "Name": "L2Const",
- "Type": "Float",
- "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.",
- "Aliases": [
- "l2"
- ],
- "Required": false,
- "SortOrder": 1.0,
- "IsNullable": true,
- "Default": null,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- "",
- 1E-07,
- 1E-06,
- 1E-05,
- 0.0001,
- 0.001,
- 0.01
- ]
- }
- },
{
"Name": "TrainingData",
"Type": "DataView",
@@ -13069,29 +13611,6 @@
"SortOrder": 1.0,
"IsNullable": false
},
- {
- "Name": "L1Threshold",
- "Type": "Float",
- "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set.",
- "Aliases": [
- "l1"
- ],
- "Required": false,
- "SortOrder": 2.0,
- "IsNullable": true,
- "Default": null,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- "",
- 0.0,
- 0.25,
- 0.5,
- 0.75,
- 1.0
- ]
- }
- },
{
"Name": "FeatureColumn",
"Type": "String",
@@ -13160,7 +13679,7 @@
"Name": "LossFunction",
"Type": {
"Kind": "Component",
- "ComponentKind": "SDCARegressionLossFunction"
+ "ComponentKind": "RegressionLossFunction"
},
"Desc": "Loss Function",
"Aliases": [
@@ -13174,112 +13693,213 @@
}
},
{
- "Name": "NumThreads",
- "Type": "Int",
- "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.",
- "Aliases": [
- "nt",
- "t",
- "threads"
- ],
- "Required": false,
- "SortOrder": 50.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "ConvergenceTolerance",
+ "Name": "LearningRate",
"Type": "Float",
- "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.",
+ "Desc": "Learning rate",
"Aliases": [
- "tol"
+ "lr"
],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 50.0,
"IsNullable": false,
- "Default": 0.01,
+ "Default": 0.1,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
- 0.001,
0.01,
0.1,
- 0.2
+ 0.5,
+ 1.0
]
}
},
{
- "Name": "MaxIterations",
- "Type": "Int",
- "Desc": "Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic.",
+ "Name": "DecreaseLearningRate",
+ "Type": "Bool",
+ "Desc": "Decrease learning rate",
"Aliases": [
- "iter"
+ "decreaselr"
],
"Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null,
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": true,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
- "",
- 10,
- 20,
- 100
+ false,
+ true
]
}
},
{
- "Name": "Shuffle",
- "Type": "Bool",
- "Desc": "Shuffle data every epoch?",
+ "Name": "L2RegularizerWeight",
+ "Type": "Float",
+ "Desc": "L2 Regularization Weight",
"Aliases": [
- "shuf"
+ "reg"
],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 50.0,
"IsNullable": false,
- "Default": true,
+ "Default": 0.0,
"SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- false,
- true
- ]
+ "RangeType": "Float",
+ "Min": 0.0,
+ "Max": 0.5
}
},
{
- "Name": "CheckFrequency",
+ "Name": "NumIterations",
"Type": "Int",
- "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations.",
+ "Desc": "Number of iterations",
"Aliases": [
- "checkFreq"
+ "iter"
],
"Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": 1,
+ "SweepRange": {
+ "RangeType": "Long",
+ "Min": 1,
+ "Max": 100,
+ "StepSize": 10.0,
+ "IsLogScale": true
+ }
},
{
- "Name": "BiasLearningRate",
+ "Name": "InitWtsDiameter",
"Type": "Float",
- "Desc": "The learning rate for adjusting bias from being regularized.",
+ "Desc": "Init weights diameter",
"Aliases": [
- "blr"
+ "initwts"
],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 140.0,
"IsNullable": false,
- "Default": 1.0,
+ "Default": 0.0,
"SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- 0.0,
- 0.01,
- 0.1,
- 1.0
+ "RangeType": "Float",
+ "Min": 0.0,
+ "Max": 1.0,
+ "NumSteps": 5
+ }
+ },
+ {
+ "Name": "ResetWeightsAfterXExamples",
+ "Type": "Int",
+ "Desc": "Number of examples after which weights will be reset to the current average",
+ "Aliases": [
+ "numreset"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "DoLazyUpdates",
+ "Type": "Bool",
+ "Desc": "Instead of updating averaged weights on every example, only update when loss is nonzero",
+ "Aliases": [
+ "lazy"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true
+ },
+ {
+ "Name": "RecencyGain",
+ "Type": "Float",
+ "Desc": "Extra weight given to more recent updates",
+ "Aliases": [
+ "rg"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.0
+ },
+ {
+ "Name": "RecencyGainMulti",
+ "Type": "Bool",
+ "Desc": "Whether Recency Gain is multiplicative (vs. additive)",
+ "Aliases": [
+ "rgm"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "Averaged",
+ "Type": "Bool",
+ "Desc": "Do averaging?",
+ "Aliases": [
+ "avg"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true
+ },
+ {
+ "Name": "AveragedTolerance",
+ "Type": "Float",
+ "Desc": "The inexactness tolerance for averaging",
+ "Aliases": [
+ "avgtol"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.01
+ },
+ {
+ "Name": "InitialWeights",
+ "Type": "String",
+ "Desc": "Initial Weights and bias, comma-separated",
+ "Aliases": [
+ "initweights"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Shuffle",
+ "Type": "Bool",
+ "Desc": "Whether to shuffle for each training iteration",
+ "Aliases": [
+ "shuf"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ false,
+ true
]
}
+ },
+ {
+ "Name": "StreamingCacheSize",
+ "Type": "Int",
+ "Desc": "Size of cache when trained in Scope",
+ "Aliases": [
+ "cache"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1000000
}
],
"Outputs": [
@@ -13299,10 +13919,10 @@
]
},
{
- "Name": "Trainers.StochasticGradientDescentBinaryClassifier",
- "Desc": "Train an Hogwild SGD binary model.",
- "FriendlyName": "Hogwild SGD (binary)",
- "ShortName": "HogwildSGD",
+ "Name": "Trainers.PcaAnomalyDetector",
+ "Desc": "Train an PCA Anomaly model.",
+ "FriendlyName": "PCA Anomaly Detector",
+ "ShortName": "pcaAnom",
"Inputs": [
{
"Name": "TrainingData",
@@ -13327,18 +13947,6 @@
"IsNullable": false,
"Default": "Features"
},
- {
- "Name": "LabelColumn",
- "Type": "String",
- "Desc": "Column to use for labels",
- "Aliases": [
- "lab"
- ],
- "Required": false,
- "SortOrder": 3.0,
- "IsNullable": false,
- "Default": "Label"
- },
{
"Name": "WeightColumn",
"Type": "String",
@@ -13392,119 +14000,49 @@
"Default": "Auto"
},
{
- "Name": "LossFunction",
- "Type": {
- "Kind": "Component",
- "ComponentKind": "ClassificationLossFunction"
- },
- "Desc": "Loss Function",
- "Aliases": [
- "loss"
- ],
- "Required": false,
- "SortOrder": 50.0,
- "IsNullable": false,
- "Default": {
- "Name": "LogLoss"
- }
- },
- {
- "Name": "L2Const",
- "Type": "Float",
- "Desc": "L2 regularizer constant",
- "Aliases": [
- "l2"
- ],
- "Required": false,
- "SortOrder": 50.0,
- "IsNullable": false,
- "Default": 1E-06,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- 1E-07,
- 5E-07,
- 1E-06,
- 5E-06,
- 1E-05
- ]
- }
- },
- {
- "Name": "NumThreads",
+ "Name": "Rank",
"Type": "Int",
- "Desc": "Degree of lock-free parallelism. Defaults to automatic depending on data sparseness. Determinism not guaranteed.",
+ "Desc": "The number of components in the PCA",
"Aliases": [
- "nt",
- "t",
- "threads"
+ "k"
],
"Required": false,
"SortOrder": 50.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "ConvergenceTolerance",
- "Type": "Float",
- "Desc": "Exponential moving averaged improvement tolerance for convergence",
- "Aliases": [
- "tol"
- ],
- "Required": false,
- "SortOrder": 150.0,
"IsNullable": false,
- "Default": 0.0001,
+ "Default": 20,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
- 0.01,
- 0.001,
- 0.0001,
- 1E-05
+ 10,
+ 20,
+ 40,
+ 80
]
}
},
{
- "Name": "MaxIterations",
+ "Name": "Oversampling",
"Type": "Int",
- "Desc": "Maximum number of iterations; set to 1 to simulate online learning.",
- "Aliases": [
- "iter"
- ],
+ "Desc": "Oversampling parameter for randomized PCA training",
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 50.0,
"IsNullable": false,
"Default": 20,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
- 1,
- 5,
10,
- 20
+ 20,
+ 40
]
}
},
{
- "Name": "InitLearningRate",
- "Type": "Float",
- "Desc": "Initial learning rate (only used by SGD)",
- "Aliases": [
- "ilr",
- "lr"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 0.01
- },
- {
- "Name": "Shuffle",
+ "Name": "Center",
"Type": "Bool",
- "Desc": "Shuffle data every epoch?",
+ "Desc": "If enabled, data is centered to be zero mean",
"Aliases": [
- "shuf"
+ "center"
],
"Required": false,
"SortOrder": 150.0,
@@ -13519,51 +14057,16 @@
}
},
{
- "Name": "PositiveInstanceWeight",
- "Type": "Float",
- "Desc": "Apply weight to the positive class, for imbalanced data",
+ "Name": "Seed",
+ "Type": "Int",
+ "Desc": "The seed for random number generation",
"Aliases": [
- "piw"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 1.0
- },
- {
- "Name": "CheckFrequency",
- "Type": "Int",
- "Desc": "Convergence check frequency (in terms of number of iterations). Default equals number of threads",
- "Aliases": [
- "checkFreq"
+ "seed"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": true,
"Default": null
- },
- {
- "Name": "Calibrator",
- "Type": {
- "Kind": "Component",
- "ComponentKind": "CalibratorTrainer"
- },
- "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": {
- "Name": "PlattCalibrator"
- }
- },
- {
- "Name": "MaxCalibrationExamples",
- "Type": "Int",
- "Desc": "The maximum number of examples to use when training the calibrator",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 1000000
}
],
"Outputs": [
@@ -13574,247 +14077,259 @@
}
],
"InputKind": [
- "ITrainerInputWithWeight",
- "ITrainerInputWithLabel",
+ "IUnsupervisedTrainerWithWeight",
"ITrainerInput"
],
"OutputKind": [
- "IBinaryClassificationOutput",
+ "IAnomalyDetectionOutput",
"ITrainerOutput"
]
},
{
- "Name": "Transforms.ApproximateBootstrapSampler",
- "Desc": "Approximate bootstrap sampling.",
- "FriendlyName": "Bootstrap Sample Transform",
- "ShortName": "BootstrapSample",
+ "Name": "Trainers.PoissonRegressor",
+ "Desc": "Train an Poisson regression model.",
+ "FriendlyName": "Poisson Regression",
+ "ShortName": "PR",
"Inputs": [
{
- "Name": "Data",
+ "Name": "TrainingData",
"Type": "DataView",
- "Desc": "Input dataset",
+ "Desc": "The data to be used for training",
+ "Aliases": [
+ "data"
+ ],
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
},
{
- "Name": "Complement",
- "Type": "Bool",
- "Desc": "Whether this is the out-of-bag sample, that is, all those rows that are not selected by the transform.",
+ "Name": "FeatureColumn",
+ "Type": "String",
+ "Desc": "Column to use for features",
"Aliases": [
- "comp"
+ "feat"
],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 2.0,
"IsNullable": false,
- "Default": false
+ "Default": "Features"
},
{
- "Name": "Seed",
- "Type": "UInt",
- "Desc": "The random seed. If unspecified random state will be instead derived from the environment.",
+ "Name": "LabelColumn",
+ "Type": "String",
+ "Desc": "Column to use for labels",
+ "Aliases": [
+ "lab"
+ ],
"Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": "Label"
},
{
- "Name": "ShuffleInput",
- "Type": "Bool",
- "Desc": "Whether we should attempt to shuffle the source data. By default on, but can be turned off for efficiency.",
+ "Name": "WeightColumn",
+ "Type": "String",
+ "Desc": "Column to use for example weight",
"Aliases": [
- "si"
+ "weight"
],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 4.0,
"IsNullable": false,
- "Default": true
+ "Default": "Weight"
},
{
- "Name": "PoolSize",
- "Type": "Int",
- "Desc": "When shuffling the output, the number of output rows to keep in that pool. Note that shuffling of output is completely distinct from shuffling of input.",
+ "Name": "NormalizeFeatures",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "No",
+ "Warn",
+ "Auto",
+ "Yes"
+ ]
+ },
+ "Desc": "Normalize option for the feature column",
"Aliases": [
- "pool"
+ "norm"
],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 5.0,
"IsNullable": false,
- "Default": 1000
- }
- ],
- "Outputs": [
+ "Default": "Auto"
+ },
{
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
+ "Name": "Caching",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Auto",
+ "Memory",
+ "Disk",
+ "None"
+ ]
+ },
+ "Desc": "Whether learner should cache input training data",
+ "Aliases": [
+ "cache"
+ ],
+ "Required": false,
+ "SortOrder": 6.0,
+ "IsNullable": false,
+ "Default": "Auto"
},
{
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
- ]
- },
- {
- "Name": "Transforms.BinaryPredictionScoreColumnsRenamer",
- "Desc": "For binary prediction, it renames the PredictedLabel and Score columns to include the name of the positive class.",
- "FriendlyName": "Rename Binary Prediction Score Columns",
- "ShortName": null,
- "Inputs": [
+ "Name": "L2Weight",
+ "Type": "Float",
+ "Desc": "L2 regularization weight",
+ "Aliases": [
+ "l2"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": 1.0,
+ "SweepRange": {
+ "RangeType": "Float",
+ "Min": 0.0,
+ "Max": 1.0,
+ "NumSteps": 4
+ }
+ },
{
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Name": "L1Weight",
+ "Type": "Float",
+ "Desc": "L1 regularization weight",
+ "Aliases": [
+ "l1"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": 1.0,
+ "SweepRange": {
+ "RangeType": "Float",
+ "Min": 0.0,
+ "Max": 1.0,
+ "NumSteps": 4
+ }
},
{
- "Name": "PredictorModel",
- "Type": "PredictorModel",
- "Desc": "The predictor model used in scoring",
- "Required": true,
- "SortOrder": 2.0,
- "IsNullable": false
- }
- ],
- "Outputs": [
+ "Name": "OptTol",
+ "Type": "Float",
+ "Desc": "Tolerance parameter for optimization convergence. Lower = slower, more accurate",
+ "Aliases": [
+ "ot"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": 1E-07,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.0001,
+ 1E-07
+ ]
+ }
+ },
{
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
+ "Name": "MemorySize",
+ "Type": "Int",
+ "Desc": "Memory size for L-BFGS. Lower=faster, less accurate",
+ "Aliases": [
+ "m"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": 20,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 5,
+ 20,
+ 50
+ ]
+ }
},
{
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
- ]
- },
- {
- "Name": "Transforms.BinNormalizer",
- "Desc": "The values are assigned into equidensity bins and a value is mapped to its bin_number/number_of_bins.",
- "FriendlyName": "Binning Normalizer",
- "ShortName": "Bin",
- "Inputs": [
- {
- "Name": "Column",
- "Type": {
- "Kind": "Array",
- "ItemType": {
- "Kind": "Struct",
- "Fields": [
- {
- "Name": "NumBins",
- "Type": "Int",
- "Desc": "Max number of bins, power of 2 recommended",
- "Aliases": [
- "bins"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "FixZero",
- "Type": "Bool",
- "Desc": "Whether to map zero to zero, preserving sparsity",
- "Aliases": [
- "zero"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "MaxTrainingExamples",
- "Type": "Int",
- "Desc": "Max number of examples used to train the normalizer",
- "Aliases": [
- "maxtrain"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
- "Aliases": [
- "name"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Source",
- "Type": "String",
- "Desc": "Name of the source column",
- "Aliases": [
- "src"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- }
- ]
- }
- },
- "Desc": "New column definition(s) (optional form: name:src)",
+ "Name": "EnforceNonNegativity",
+ "Type": "Bool",
+ "Desc": "Enforce non-negative weights",
"Aliases": [
- "col"
+ "nn"
],
"Required": false,
- "SortOrder": 1.0,
+ "SortOrder": 90.0,
"IsNullable": false,
- "Default": null
+ "Default": false
},
{
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Name": "InitWtsDiameter",
+ "Type": "Float",
+ "Desc": "Init weights diameter",
+ "Aliases": [
+ "initwts"
+ ],
+ "Required": false,
+ "SortOrder": 140.0,
+ "IsNullable": false,
+ "Default": 0.0,
+ "SweepRange": {
+ "RangeType": "Float",
+ "Min": 0.0,
+ "Max": 1.0,
+ "NumSteps": 5
+ }
},
{
- "Name": "NumBins",
+ "Name": "MaxIterations",
"Type": "Int",
- "Desc": "Max number of bins, power of 2 recommended",
+ "Desc": "Maximum iterations.",
"Aliases": [
- "bins"
+ "maxiter"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": 1024
+ "Default": 2147483647,
+ "SweepRange": {
+ "RangeType": "Long",
+ "Min": 1,
+ "Max": 2147483647
+ }
},
{
- "Name": "FixZero",
+ "Name": "SgdInitializationTolerance",
+ "Type": "Float",
+ "Desc": "Run SGD to initialize LR weights, converging to this tolerance",
+ "Aliases": [
+ "sgd"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.0
+ },
+ {
+ "Name": "Quiet",
"Type": "Bool",
- "Desc": "Whether to map zero to zero, preserving sparsity",
+ "Desc": "If set to true, produce no output during training.",
"Aliases": [
- "zero"
+ "q"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "UseThreads",
+ "Type": "Bool",
+ "Desc": "Whether or not to use threads. Default is true",
+ "Aliases": [
+ "t"
],
"Required": false,
"SortOrder": 150.0,
@@ -13822,1471 +14337,1189 @@
"Default": true
},
{
- "Name": "MaxTrainingExamples",
+ "Name": "NumThreads",
"Type": "Int",
- "Desc": "Max number of examples used to train the normalizer",
+ "Desc": "Number of threads",
"Aliases": [
- "maxtrain"
+ "nt"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "DenseOptimizer",
+ "Type": "Bool",
+ "Desc": "Force densification of the internal optimization vectors",
+ "Aliases": [
+ "do"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": 1000000000
+ "Default": false,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ false,
+ true
+ ]
+ }
}
],
"Outputs": [
{
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
- },
- {
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
+ "Name": "PredictorModel",
+ "Type": "PredictorModel",
+ "Desc": "The trained model"
}
],
"InputKind": [
- "ITransformInput"
+ "ITrainerInputWithWeight",
+ "ITrainerInputWithLabel",
+ "ITrainerInput"
],
"OutputKind": [
- "ITransformOutput"
+ "IRegressionOutput",
+ "ITrainerOutput"
]
},
{
- "Name": "Transforms.CategoricalHashOneHotVectorizer",
- "Desc": "Encodes the categorical variable with hash-based encoding",
- "FriendlyName": "Categorical Hash Transform",
- "ShortName": null,
+ "Name": "Trainers.StochasticDualCoordinateAscentBinaryClassifier",
+ "Desc": "Train an SDCA binary model.",
+ "FriendlyName": "Fast Linear (SA-SDCA)",
+ "ShortName": "SDCA",
"Inputs": [
{
- "Name": "Column",
- "Type": {
- "Kind": "Array",
- "ItemType": {
- "Kind": "Struct",
- "Fields": [
- {
- "Name": "OutputKind",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "Bag",
- "Ind",
- "Key",
- "Bin"
- ]
- },
- "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)",
- "Aliases": [
- "kind"
- ],
- "Required": false,
- "SortOrder": 102.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "HashBits",
- "Type": "Int",
- "Desc": "The number of bits to hash into. Must be between 1 and 30, inclusive.",
- "Aliases": [
- "bits"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Seed",
- "Type": "UInt",
- "Desc": "Hashing seed",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Ordered",
- "Type": "Bool",
- "Desc": "Whether the position of each term should be included in the hash",
- "Aliases": [
- "ord"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "InvertHash",
- "Type": "Int",
- "Desc": "Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.",
- "Aliases": [
- "ih"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
- "Aliases": [
- "name"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Source",
- "Type": "String",
- "Desc": "Name of the source column",
- "Aliases": [
- "src"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- }
- ]
- }
- },
- "Desc": "New column definition(s) (optional form: name:hashBits:src)",
+ "Name": "L2Const",
+ "Type": "Float",
+ "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.",
"Aliases": [
- "col"
+ "l2"
],
- "Required": true,
+ "Required": false,
"SortOrder": 1.0,
- "IsNullable": false
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ "",
+ 1E-07,
+ 1E-06,
+ 1E-05,
+ 0.0001,
+ 0.001,
+ 0.01
+ ]
+ }
},
{
- "Name": "Data",
+ "Name": "TrainingData",
"Type": "DataView",
- "Desc": "Input dataset",
+ "Desc": "The data to be used for training",
+ "Aliases": [
+ "data"
+ ],
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
},
{
- "Name": "HashBits",
- "Type": "Int",
- "Desc": "Number of bits to hash into. Must be between 1 and 30, inclusive.",
+ "Name": "L1Threshold",
+ "Type": "Float",
+ "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set.",
"Aliases": [
- "bits"
+ "l1"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ "",
+ 0.0,
+ 0.25,
+ 0.5,
+ 0.75,
+ 1.0
+ ]
+ }
+ },
+ {
+ "Name": "FeatureColumn",
+ "Type": "String",
+ "Desc": "Column to use for features",
+ "Aliases": [
+ "feat"
],
"Required": false,
"SortOrder": 2.0,
"IsNullable": false,
- "Default": 16
+ "Default": "Features"
},
{
- "Name": "OutputKind",
+ "Name": "LabelColumn",
+ "Type": "String",
+ "Desc": "Column to use for labels",
+ "Aliases": [
+ "lab"
+ ],
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": "Label"
+ },
+ {
+ "Name": "NormalizeFeatures",
"Type": {
"Kind": "Enum",
"Values": [
- "Bag",
- "Ind",
- "Key",
- "Bin"
+ "No",
+ "Warn",
+ "Auto",
+ "Yes"
]
},
- "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)",
+ "Desc": "Normalize option for the feature column",
"Aliases": [
- "kind"
+ "norm"
],
"Required": false,
- "SortOrder": 102.0,
+ "SortOrder": 5.0,
"IsNullable": false,
- "Default": "Bag"
+ "Default": "Auto"
},
{
- "Name": "Seed",
- "Type": "UInt",
- "Desc": "Hashing seed",
+ "Name": "Caching",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Auto",
+ "Memory",
+ "Disk",
+ "None"
+ ]
+ },
+ "Desc": "Whether learner should cache input training data",
+ "Aliases": [
+ "cache"
+ ],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 6.0,
"IsNullable": false,
- "Default": 314489979
+ "Default": "Auto"
},
{
- "Name": "Ordered",
- "Type": "Bool",
- "Desc": "Whether the position of each term should be included in the hash",
+ "Name": "LossFunction",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "SDCAClassificationLossFunction"
+ },
+ "Desc": "Loss Function",
"Aliases": [
- "ord"
+ "loss"
],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 50.0,
"IsNullable": false,
- "Default": true
+ "Default": {
+ "Name": "LogLoss"
+ }
},
{
- "Name": "InvertHash",
+ "Name": "NumThreads",
"Type": "Int",
- "Desc": "Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.",
+ "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.",
"Aliases": [
- "ih"
+ "nt",
+ "t",
+ "threads"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "PositiveInstanceWeight",
+ "Type": "Float",
+ "Desc": "Apply weight to the positive class, for imbalanced data",
+ "Aliases": [
+ "piw"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": 0
- }
- ],
- "Outputs": [
+ "Default": 1.0
+ },
{
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
+ "Name": "Calibrator",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "CalibratorTrainer"
+ },
+ "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": {
+ "Name": "PlattCalibrator"
+ }
},
{
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
- ]
- },
- {
- "Name": "Transforms.CategoricalOneHotVectorizer",
- "Desc": "Encodes the categorical variable with one-hot encoding based on term dictionary",
- "FriendlyName": "Categorical Transform",
- "ShortName": null,
- "Inputs": [
+ "Name": "MaxCalibrationExamples",
+ "Type": "Int",
+ "Desc": "The maximum number of examples to use when training the calibrator",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1000000
+ },
{
- "Name": "Column",
- "Type": {
- "Kind": "Array",
- "ItemType": {
- "Kind": "Struct",
- "Fields": [
- {
- "Name": "OutputKind",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "Bag",
- "Ind",
- "Key",
- "Bin"
- ]
- },
- "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector",
- "Aliases": [
- "kind"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "MaxNumTerms",
- "Type": "Int",
- "Desc": "Maximum number of terms to keep when auto-training",
- "Aliases": [
- "max"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Term",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "List of terms",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Sort",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "Occurrence",
- "Value"
- ]
- },
- "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "TextKeyValues",
- "Type": "Bool",
- "Desc": "Whether key value metadata should be text, regardless of the actual input type",
- "Aliases": [
- "textkv"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
- "Aliases": [
- "name"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Source",
- "Type": "String",
- "Desc": "Name of the source column",
- "Aliases": [
- "src"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- }
- ]
- }
- },
- "Desc": "New column definition(s) (optional form: name:src)",
+ "Name": "ConvergenceTolerance",
+ "Type": "Float",
+ "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.",
"Aliases": [
- "col"
+ "tol"
],
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
- {
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.1,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.001,
+ 0.01,
+ 0.1,
+ 0.2
+ ]
+ }
},
{
- "Name": "MaxNumTerms",
+ "Name": "MaxIterations",
"Type": "Int",
- "Desc": "Maximum number of terms to keep per column when auto-training",
+ "Desc": "Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic.",
"Aliases": [
- "max"
+ "iter"
],
"Required": false,
- "SortOrder": 5.0,
- "IsNullable": false,
- "Default": 1000000
- },
- {
- "Name": "OutputKind",
- "Type": {
- "Kind": "Enum",
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Discrete",
"Values": [
- "Bag",
- "Ind",
- "Key",
- "Bin"
+ "",
+ 10,
+ 20,
+ 100
]
- },
- "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)",
+ }
+ },
+ {
+ "Name": "Shuffle",
+ "Type": "Bool",
+ "Desc": "Shuffle data every epoch?",
"Aliases": [
- "kind"
+ "shuf"
],
"Required": false,
- "SortOrder": 102.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": "Ind"
+ "Default": true,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ false,
+ true
+ ]
+ }
},
{
- "Name": "Term",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "List of terms",
+ "Name": "CheckFrequency",
+ "Type": "Int",
+ "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations.",
+ "Aliases": [
+ "checkFreq"
+ ],
"Required": false,
- "SortOrder": 106.0,
- "IsNullable": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
"Default": null
},
{
- "Name": "Sort",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "Occurrence",
- "Value"
- ]
- },
- "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').",
- "Required": false,
- "SortOrder": 113.0,
- "IsNullable": false,
- "Default": "Occurrence"
- },
- {
- "Name": "TextKeyValues",
- "Type": "Bool",
- "Desc": "Whether key value metadata should be text, regardless of the actual input type",
+ "Name": "BiasLearningRate",
+ "Type": "Float",
+ "Desc": "The learning rate for adjusting bias from being regularized.",
"Aliases": [
- "textkv"
+ "blr"
],
"Required": false,
- "SortOrder": 114.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": true
+ "Default": 0.0,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.0,
+ 0.01,
+ 0.1,
+ 1.0
+ ]
+ }
}
],
"Outputs": [
{
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
- },
- {
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
+ "Name": "PredictorModel",
+ "Type": "PredictorModel",
+ "Desc": "The trained model"
}
],
"InputKind": [
- "ITransformInput"
+ "ITrainerInputWithLabel",
+ "ITrainerInput"
],
"OutputKind": [
- "ITransformOutput"
+ "IBinaryClassificationOutput",
+ "ITrainerOutput"
]
},
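The entry that closes here is a binary-classification SDCA trainer, judging by its IBinaryClassificationOutput kind (its Name field sits above this excerpt). As an illustration only, the hand-written C# below mirrors the input fields and defaults listed in that entry; it is not the generated API, just a sketch of how the manifest's name/type/default triples read as a strongly-typed options bag.

public sealed class SdcaBinaryOptionsSketch
{
    // Nullable fields model "IsNullable": true with "Default": null,
    // which the trainer interprets as "infer from the data set".
    public float? L2Const = null;
    public float? L1Threshold = null;
    public string FeatureColumn = "Features";
    public string LabelColumn = "Label";
    public int? NumThreads = null;            // automatic; determinism not guaranteed
    public float PositiveInstanceWeight = 1.0f;
    public int MaxCalibrationExamples = 1000000;
    public float ConvergenceTolerance = 0.1f; // duality gap / primal loss ratio
    public int? MaxIterations = null;         // 1 simulates online learning
    public bool Shuffle = true;
    public int? CheckFrequency = null;        // null => check every NumThreads iterations
    public float BiasLearningRate = 0.0f;
}
// end of sketch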
{
- "Name": "Transforms.CharacterTokenizer",
- "Desc": "Character-oriented tokenizer where text is considered a sequence of characters.",
- "FriendlyName": "Character Tokenizer Transform",
- "ShortName": "CharToken",
+ "Name": "Trainers.StochasticDualCoordinateAscentClassifier",
+      "Desc": "This classifier is a trainer based on the Stochastic Dual Coordinate Ascent (SDCA) method, a state-of-the-art optimization technique for convex objective functions. The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronous implementation that supports multi-threading. Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread. Several choices of loss functions are also provided. The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms. For more information on SDCA, see: Scaling Up Stochastic Dual Coordinate Ascent; Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization. Note that SDCA is a stochastic and streaming optimization algorithm. The results depend on the order of the training data. For reproducible results, it is recommended that one set `shuffle` to `False` and `NumThreads` to `1`. Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. In general, the larger the l2_weight, the faster SDCA converges.",
+ "FriendlyName": "Fast Linear Multi-class Classification (SA-SDCA)",
+ "ShortName": "sasdcamc",
"Inputs": [
{
- "Name": "Column",
- "Type": {
- "Kind": "Array",
- "ItemType": {
- "Kind": "Struct",
- "Fields": [
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
- "Aliases": [
- "name"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Source",
- "Type": "String",
- "Desc": "Name of the source column",
- "Aliases": [
- "src"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- }
- ]
- }
- },
- "Desc": "New column definition(s) (optional form: name:src)",
+ "Name": "L2Const",
+ "Type": "Float",
+ "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.",
"Aliases": [
- "col"
+ "l2"
],
- "Required": true,
+ "Required": false,
"SortOrder": 1.0,
- "IsNullable": false
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ "",
+ 1E-07,
+ 1E-06,
+ 1E-05,
+ 0.0001,
+ 0.001,
+ 0.01
+ ]
+ }
},
{
- "Name": "Data",
+ "Name": "TrainingData",
"Type": "DataView",
- "Desc": "Input dataset",
+ "Desc": "The data to be used for training",
+ "Aliases": [
+ "data"
+ ],
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
},
{
- "Name": "UseMarkerChars",
- "Type": "Bool",
- "Desc": "Whether to mark the beginning/end of each row/slot with start of text character (0x02)/end of text character (0x03)",
+ "Name": "L1Threshold",
+ "Type": "Float",
+ "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set.",
"Aliases": [
- "mark"
+ "l1"
],
"Required": false,
"SortOrder": 2.0,
- "IsNullable": false,
- "Default": true
- }
- ],
- "Outputs": [
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ "",
+ 0.0,
+ 0.25,
+ 0.5,
+ 0.75,
+ 1.0
+ ]
+ }
+ },
{
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
+ "Name": "FeatureColumn",
+ "Type": "String",
+ "Desc": "Column to use for features",
+ "Aliases": [
+ "feat"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": false,
+ "Default": "Features"
},
{
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
- ]
- },
- {
- "Name": "Transforms.ColumnConcatenator",
- "Desc": "Concatenates two columns of the same item type.",
- "FriendlyName": "Concat Transform",
- "ShortName": "Concat",
- "Inputs": [
+ "Name": "LabelColumn",
+ "Type": "String",
+ "Desc": "Column to use for labels",
+ "Aliases": [
+ "lab"
+ ],
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": "Label"
+ },
{
- "Name": "Column",
+ "Name": "NormalizeFeatures",
"Type": {
- "Kind": "Array",
- "ItemType": {
- "Kind": "Struct",
- "Fields": [
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
- "Aliases": [
- "name"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Source",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "Name of the source column",
- "Aliases": [
- "src"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- }
- ]
- }
+ "Kind": "Enum",
+ "Values": [
+ "No",
+ "Warn",
+ "Auto",
+ "Yes"
+ ]
},
- "Desc": "New column definition(s) (optional form: name:srcs)",
+ "Desc": "Normalize option for the feature column",
"Aliases": [
- "col"
+ "norm"
],
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
- {
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- }
- ],
- "Outputs": [
- {
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
+ "Required": false,
+ "SortOrder": 5.0,
+ "IsNullable": false,
+ "Default": "Auto"
},
{
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
- ]
- },
- {
- "Name": "Transforms.ColumnCopier",
- "Desc": "Duplicates columns from the dataset",
- "FriendlyName": "Copy Columns Transform",
- "ShortName": "Copy",
- "Inputs": [
- {
- "Name": "Column",
+ "Name": "Caching",
"Type": {
- "Kind": "Array",
- "ItemType": {
- "Kind": "Struct",
- "Fields": [
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
- "Aliases": [
- "name"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Source",
- "Type": "String",
- "Desc": "Name of the source column",
- "Aliases": [
- "src"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- }
- ]
- }
+ "Kind": "Enum",
+ "Values": [
+ "Auto",
+ "Memory",
+ "Disk",
+ "None"
+ ]
},
- "Desc": "New column definition(s) (optional form: name:src)",
+ "Desc": "Whether learner should cache input training data",
"Aliases": [
- "col"
+ "cache"
],
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Required": false,
+ "SortOrder": 6.0,
+ "IsNullable": false,
+ "Default": "Auto"
},
{
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- }
- ],
- "Outputs": [
+ "Name": "LossFunction",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "SDCAClassificationLossFunction"
+ },
+ "Desc": "Loss Function",
+ "Aliases": [
+ "loss"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": {
+ "Name": "LogLoss"
+ }
+ },
{
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
+ "Name": "NumThreads",
+ "Type": "Int",
+ "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.",
+ "Aliases": [
+ "nt",
+ "t",
+ "threads"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": true,
+ "Default": null
},
{
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
- ]
- },
- {
- "Name": "Transforms.ColumnDropper",
- "Desc": "Drops columns from the dataset",
- "FriendlyName": "Drop Columns Transform",
- "ShortName": "Drop",
- "Inputs": [
+ "Name": "ConvergenceTolerance",
+ "Type": "Float",
+ "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.",
+ "Aliases": [
+ "tol"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.1,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.001,
+ 0.01,
+ 0.1,
+ 0.2
+ ]
+ }
+ },
{
- "Name": "Column",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "Column name to drop",
+ "Name": "MaxIterations",
+ "Type": "Int",
+ "Desc": "Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic.",
"Aliases": [
- "col"
+ "iter"
],
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ "",
+ 10,
+ 20,
+ 100
+ ]
+ }
},
{
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Name": "Shuffle",
+ "Type": "Bool",
+ "Desc": "Shuffle data every epoch?",
+ "Aliases": [
+ "shuf"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ false,
+ true
+ ]
+ }
+ },
+ {
+ "Name": "CheckFrequency",
+ "Type": "Int",
+ "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations.",
+ "Aliases": [
+ "checkFreq"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "BiasLearningRate",
+ "Type": "Float",
+ "Desc": "The learning rate for adjusting bias from being regularized.",
+ "Aliases": [
+ "blr"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.0,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.0,
+ 0.01,
+ 0.1,
+ 1.0
+ ]
+ }
}
],
"Outputs": [
{
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
- },
- {
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
+ "Name": "PredictorModel",
+ "Type": "PredictorModel",
+ "Desc": "The trained model"
}
],
"InputKind": [
- "ITransformInput"
+ "ITrainerInputWithLabel",
+ "ITrainerInput"
],
"OutputKind": [
- "ITransformOutput"
+ "IMulticlassClassificationOutput",
+ "ITrainerOutput"
]
},
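Several of the inputs in the entry above carry a Discrete SweepRange, where the empty string stands for the automatic (null) setting. The short C# below is a hedged sketch of how a sweeper might expand the L2Const and L1Threshold ranges from this entry into a parameter grid; it is not taken from the repository, only an illustration of what the SweepRange metadata encodes.

using System;
using System.Collections.Generic;

static class SdcaSweepGridSketch
{
    static void Main()
    {
        // Values copied from the SweepRange blocks above; null models "".
        float?[] l2Values = { null, 1e-07f, 1e-06f, 1e-05f, 1e-04f, 1e-03f, 1e-02f };
        float?[] l1Values = { null, 0.0f, 0.25f, 0.5f, 0.75f, 1.0f };

        var grid = new List<(float? L2Const, float? L1Threshold)>();
        foreach (var l2 in l2Values)
            foreach (var l1 in l1Values)
                grid.Add((l2, l1));

        Console.WriteLine($"{grid.Count} candidate configurations"); // 7 * 6 = 42
    }
}
// end of sketch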
{
- "Name": "Transforms.ColumnSelector",
- "Desc": "Selects a set of columns, dropping all others",
- "FriendlyName": "Select Columns",
- "ShortName": null,
+ "Name": "Trainers.StochasticDualCoordinateAscentRegressor",
+      "Desc": "This is a trainer based on the Stochastic Dual Coordinate Ascent (SDCA) method, a state-of-the-art optimization technique for convex objective functions. The algorithm can be scaled for use on large out-of-memory data sets due to a semi-asynchronous implementation that supports multi-threading. Convergence is underwritten by periodically enforcing synchronization between primal and dual updates in a separate thread. Several choices of loss functions are also provided. The SDCA method combines several of the best properties and capabilities of logistic regression and SVM algorithms. For more information on SDCA, see: Scaling Up Stochastic Dual Coordinate Ascent; Stochastic Dual Coordinate Ascent Methods for Regularized Loss Minimization. Note that SDCA is a stochastic and streaming optimization algorithm. The results depend on the order of the training data. For reproducible results, it is recommended that one set `shuffle` to `False` and `NumThreads` to `1`. Elastic net regularization can be specified by the l2_weight and l1_weight parameters. Note that the l2_weight has an effect on the rate of convergence. In general, the larger the l2_weight, the faster SDCA converges.",
+ "FriendlyName": "Fast Linear Regression (SA-SDCA)",
+ "ShortName": "sasdcar",
"Inputs": [
{
- "Name": "Column",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "Column name to keep",
+ "Name": "L2Const",
+ "Type": "Float",
+ "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.",
"Aliases": [
- "col"
+ "l2"
],
"Required": false,
"SortOrder": 1.0,
- "IsNullable": false,
- "Default": null
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ "",
+ 1E-07,
+ 1E-06,
+ 1E-05,
+ 0.0001,
+ 0.001,
+ 0.01
+ ]
+ }
},
{
- "Name": "Data",
+ "Name": "TrainingData",
"Type": "DataView",
- "Desc": "Input dataset",
+ "Desc": "The data to be used for training",
+ "Aliases": [
+ "data"
+ ],
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
- }
- ],
- "Outputs": [
+ },
{
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
+ "Name": "L1Threshold",
+ "Type": "Float",
+ "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set.",
+ "Aliases": [
+ "l1"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ "",
+ 0.0,
+ 0.25,
+ 0.5,
+ 0.75,
+ 1.0
+ ]
+ }
},
{
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
- ]
- },
- {
- "Name": "Transforms.ColumnTypeConverter",
- "Desc": "Converts a column to a different type, using standard conversions.",
- "FriendlyName": "Convert Transform",
- "ShortName": "Convert",
- "Inputs": [
- {
- "Name": "Column",
- "Type": {
- "Kind": "Array",
- "ItemType": {
- "Kind": "Struct",
- "Fields": [
- {
- "Name": "ResultType",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "I1",
- "U1",
- "I2",
- "U2",
- "I4",
- "U4",
- "I8",
- "U8",
- "R4",
- "Num",
- "R8",
- "TX",
- "Text",
- "TXT",
- "BL",
- "Bool",
- "TimeSpan",
- "TS",
- "DT",
- "DateTime",
- "DZ",
- "DateTimeZone",
- "UG",
- "U16"
- ]
- },
- "Desc": "The result type",
- "Aliases": [
- "type"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Range",
- "Type": "String",
- "Desc": "For a key column, this defines the range of values",
- "Aliases": [
- "key"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
- "Aliases": [
- "name"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Source",
- "Type": "String",
- "Desc": "Name of the source column",
- "Aliases": [
- "src"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- }
- ]
- }
- },
- "Desc": "New column definition(s) (optional form: name:type:src)",
+ "Name": "FeatureColumn",
+ "Type": "String",
+ "Desc": "Column to use for features",
"Aliases": [
- "col"
+ "feat"
],
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": false,
+ "Default": "Features"
},
{
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Name": "LabelColumn",
+ "Type": "String",
+ "Desc": "Column to use for labels",
+ "Aliases": [
+ "lab"
+ ],
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": "Label"
},
{
- "Name": "ResultType",
+ "Name": "NormalizeFeatures",
"Type": {
"Kind": "Enum",
"Values": [
- "I1",
- "U1",
- "I2",
- "U2",
- "I4",
- "U4",
- "I8",
- "U8",
- "R4",
- "Num",
- "R8",
- "TX",
- "Text",
- "TXT",
- "BL",
- "Bool",
- "TimeSpan",
- "TS",
- "DT",
- "DateTime",
- "DZ",
- "DateTimeZone",
- "UG",
- "U16"
+ "No",
+ "Warn",
+ "Auto",
+ "Yes"
]
},
- "Desc": "The result type",
+ "Desc": "Normalize option for the feature column",
"Aliases": [
- "type"
+ "norm"
],
"Required": false,
- "SortOrder": 2.0,
- "IsNullable": true,
- "Default": null
+ "SortOrder": 5.0,
+ "IsNullable": false,
+ "Default": "Auto"
},
{
- "Name": "Range",
- "Type": "String",
- "Desc": "For a key column, this defines the range of values",
+ "Name": "Caching",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Auto",
+ "Memory",
+ "Disk",
+ "None"
+ ]
+ },
+ "Desc": "Whether learner should cache input training data",
"Aliases": [
- "key"
+ "cache"
],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 6.0,
"IsNullable": false,
- "Default": null
- }
- ],
- "Outputs": [
- {
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
+ "Default": "Auto"
},
{
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
- ]
- },
- {
- "Name": "Transforms.CombinerByContiguousGroupId",
- "Desc": "Groups values of a scalar column into a vector, by a contiguous group ID",
- "FriendlyName": "Group Transform",
- "ShortName": "Group",
- "Inputs": [
- {
- "Name": "GroupKey",
+ "Name": "LossFunction",
"Type": {
- "Kind": "Array",
- "ItemType": "String"
+ "Kind": "Component",
+ "ComponentKind": "SDCARegressionLossFunction"
},
- "Desc": "Columns to group by",
+ "Desc": "Loss Function",
"Aliases": [
- "g"
+ "loss"
],
"Required": false,
- "SortOrder": 1.0,
+ "SortOrder": 50.0,
"IsNullable": false,
- "Default": null
+ "Default": {
+ "Name": "SquaredLoss"
+ }
},
{
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Name": "NumThreads",
+ "Type": "Int",
+ "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.",
+ "Aliases": [
+ "nt",
+ "t",
+ "threads"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": true,
+ "Default": null
},
{
- "Name": "Column",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "Columns to group together",
+ "Name": "ConvergenceTolerance",
+ "Type": "Float",
+ "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.",
"Aliases": [
- "col"
+ "tol"
],
- "Required": true,
- "SortOrder": 2.0,
- "IsNullable": false
- }
- ],
- "Outputs": [
- {
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.01,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.001,
+ 0.01,
+ 0.1,
+ 0.2
+ ]
+ }
},
{
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
- ]
- },
- {
- "Name": "Transforms.ConditionalNormalizer",
- "Desc": "Normalize the columns only if needed",
- "FriendlyName": "Normalize If Needed",
- "ShortName": null,
- "Inputs": [
- {
- "Name": "Column",
- "Type": {
- "Kind": "Array",
- "ItemType": {
- "Kind": "Struct",
- "Fields": [
- {
- "Name": "FixZero",
- "Type": "Bool",
- "Desc": "Whether to map zero to zero, preserving sparsity",
- "Aliases": [
- "zero"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "MaxTrainingExamples",
- "Type": "Int",
- "Desc": "Max number of examples used to train the normalizer",
- "Aliases": [
- "maxtrain"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
- "Aliases": [
- "name"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Source",
- "Type": "String",
- "Desc": "Name of the source column",
- "Aliases": [
- "src"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- }
- ]
- }
- },
- "Desc": "New column definition(s) (optional form: name:src)",
+ "Name": "MaxIterations",
+ "Type": "Int",
+ "Desc": "Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic.",
"Aliases": [
- "col"
+ "iter"
],
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
- {
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ "",
+ 10,
+ 20,
+ 100
+ ]
+ }
},
{
- "Name": "FixZero",
+ "Name": "Shuffle",
"Type": "Bool",
- "Desc": "Whether to map zero to zero, preserving sparsity",
+ "Desc": "Shuffle data every epoch?",
"Aliases": [
- "zero"
+ "shuf"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": true
+ "Default": true,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ false,
+ true
+ ]
+ }
},
{
- "Name": "MaxTrainingExamples",
+ "Name": "CheckFrequency",
"Type": "Int",
- "Desc": "Max number of examples used to train the normalizer",
+ "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations.",
"Aliases": [
- "maxtrain"
+ "checkFreq"
],
"Required": false,
"SortOrder": 150.0,
- "IsNullable": false,
- "Default": 1000000000
- }
- ],
- "Outputs": [
- {
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
- },
- {
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ]
- },
- {
- "Name": "Transforms.DataCache",
- "Desc": "Caches using the specified cache option.",
- "FriendlyName": "Cache Data",
- "ShortName": null,
- "Inputs": [
- {
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "IsNullable": true,
+ "Default": null
},
{
- "Name": "Caching",
- "Type": {
- "Kind": "Enum",
+ "Name": "BiasLearningRate",
+ "Type": "Float",
+ "Desc": "The learning rate for adjusting bias from being regularized.",
+ "Aliases": [
+ "blr"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1.0,
+ "SweepRange": {
+ "RangeType": "Discrete",
"Values": [
- "Memory",
- "Disk"
+ 0.0,
+ 0.01,
+ 0.1,
+ 1.0
]
- },
- "Desc": "Caching strategy",
- "Required": true,
- "SortOrder": 2.0,
- "IsNullable": false,
- "Default": "Memory"
+ }
}
],
"Outputs": [
{
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Dataset"
+ "Name": "PredictorModel",
+ "Type": "PredictorModel",
+ "Desc": "The trained model"
}
],
"InputKind": [
- "ITransformInput"
+ "ITrainerInputWithLabel",
+ "ITrainerInput"
+ ],
+ "OutputKind": [
+ "IRegressionOutput",
+ "ITrainerOutput"
]
},
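For the regressor entry above, the main differences from the classification variants are the SDCARegressionLossFunction component (default SquaredLoss), the tighter ConvergenceTolerance default of 0.01, and a BiasLearningRate default of 1.0. The C# below hand-assembles a JSON node naming this entry point with a few of those inputs; the surrounding graph envelope (the "Inputs"/"Outputs" keys and the $-prefixed variables) is an assumption made for illustration and is not part of this diff.

using System;

static class SdcaRegressorNodeSketch
{
    static void Main()
    {
        // Field names come from the manifest entry above; the graph schema
        // around them is assumed, not shown in this file.
        string node = @"{
  ""Name"": ""Trainers.StochasticDualCoordinateAscentRegressor"",
  ""Inputs"": {
    ""TrainingData"": ""$data"",
    ""FeatureColumn"": ""Features"",
    ""LabelColumn"": ""Label"",
    ""LossFunction"": { ""Name"": ""SquaredLoss"" },
    ""ConvergenceTolerance"": 0.01,
    ""BiasLearningRate"": 1.0
  },
  ""Outputs"": { ""PredictorModel"": ""$model"" }
}";
        Console.WriteLine(node);
    }
}
// end of sketch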
{
- "Name": "Transforms.DatasetScorer",
- "Desc": "Score a dataset with a predictor model",
- "FriendlyName": null,
- "ShortName": null,
+ "Name": "Trainers.StochasticGradientDescentBinaryClassifier",
+      "Desc": "Train a Hogwild SGD binary model.",
+ "FriendlyName": "Hogwild SGD (binary)",
+ "ShortName": "HogwildSGD",
"Inputs": [
{
- "Name": "Data",
+ "Name": "TrainingData",
"Type": "DataView",
- "Desc": "The dataset to be scored",
+ "Desc": "The data to be used for training",
+ "Aliases": [
+ "data"
+ ],
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
},
{
- "Name": "PredictorModel",
- "Type": "PredictorModel",
- "Desc": "The predictor model to apply to data",
- "Required": true,
+ "Name": "FeatureColumn",
+ "Type": "String",
+ "Desc": "Column to use for features",
+ "Aliases": [
+ "feat"
+ ],
+ "Required": false,
"SortOrder": 2.0,
- "IsNullable": false
+ "IsNullable": false,
+ "Default": "Features"
},
{
- "Name": "Suffix",
+ "Name": "LabelColumn",
"Type": "String",
- "Desc": "Suffix to append to the score columns",
+ "Desc": "Column to use for labels",
+ "Aliases": [
+ "lab"
+ ],
"Required": false,
"SortOrder": 3.0,
"IsNullable": false,
- "Default": null
- }
- ],
- "Outputs": [
- {
- "Name": "ScoredData",
- "Type": "DataView",
- "Desc": "The scored dataset"
- },
- {
- "Name": "ScoringTransform",
- "Type": "TransformModel",
- "Desc": "The scoring transform"
- }
- ]
- },
- {
- "Name": "Transforms.DatasetTransformScorer",
- "Desc": "Score a dataset with a transform model",
- "FriendlyName": null,
- "ShortName": null,
- "Inputs": [
- {
- "Name": "Data",
- "Type": "DataView",
- "Desc": "The dataset to be scored",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
- {
- "Name": "TransformModel",
- "Type": "TransformModel",
- "Desc": "The transform model to apply to data",
- "Required": true,
- "SortOrder": 2.0,
- "IsNullable": false
- }
- ],
- "Outputs": [
- {
- "Name": "ScoredData",
- "Type": "DataView",
- "Desc": "The scored dataset"
+ "Default": "Label"
},
{
- "Name": "ScoringTransform",
- "Type": "TransformModel",
- "Desc": "The scoring transform"
- }
- ]
- },
- {
- "Name": "Transforms.Dictionarizer",
- "Desc": "Converts input values (words, numbers, etc.) to index in a dictionary.",
- "FriendlyName": "Term Transform",
- "ShortName": "TermTransform",
- "Inputs": [
- {
- "Name": "Column",
- "Type": {
- "Kind": "Array",
- "ItemType": {
- "Kind": "Struct",
- "Fields": [
- {
- "Name": "MaxNumTerms",
- "Type": "Int",
- "Desc": "Maximum number of terms to keep when auto-training",
- "Aliases": [
- "max"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Term",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "List of terms",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Sort",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "Occurrence",
- "Value"
- ]
- },
- "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "TextKeyValues",
- "Type": "Bool",
- "Desc": "Whether key value metadata should be text, regardless of the actual input type",
- "Aliases": [
- "textkv"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
- "Aliases": [
- "name"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Source",
- "Type": "String",
- "Desc": "Name of the source column",
- "Aliases": [
- "src"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- }
- ]
- }
- },
- "Desc": "New column definition(s) (optional form: name:src)",
+ "Name": "WeightColumn",
+ "Type": "String",
+ "Desc": "Column to use for example weight",
"Aliases": [
- "col"
+ "weight"
],
"Required": false,
- "SortOrder": 1.0,
+ "SortOrder": 4.0,
"IsNullable": false,
- "Default": null
- },
- {
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Default": "Weight"
},
{
- "Name": "MaxNumTerms",
- "Type": "Int",
- "Desc": "Maximum number of terms to keep per column when auto-training",
+ "Name": "NormalizeFeatures",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "No",
+ "Warn",
+ "Auto",
+ "Yes"
+ ]
+ },
+ "Desc": "Normalize option for the feature column",
"Aliases": [
- "max"
+ "norm"
],
"Required": false,
"SortOrder": 5.0,
"IsNullable": false,
- "Default": 1000000
+ "Default": "Auto"
},
{
- "Name": "Term",
+ "Name": "Caching",
"Type": {
- "Kind": "Array",
- "ItemType": "String"
+ "Kind": "Enum",
+ "Values": [
+ "Auto",
+ "Memory",
+ "Disk",
+ "None"
+ ]
},
- "Desc": "List of terms",
+ "Desc": "Whether learner should cache input training data",
+ "Aliases": [
+ "cache"
+ ],
"Required": false,
- "SortOrder": 106.0,
+ "SortOrder": 6.0,
"IsNullable": false,
- "Default": null
+ "Default": "Auto"
},
{
- "Name": "Sort",
+ "Name": "LossFunction",
"Type": {
- "Kind": "Enum",
- "Values": [
- "Occurrence",
- "Value"
- ]
+ "Kind": "Component",
+ "ComponentKind": "ClassificationLossFunction"
},
- "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').",
+ "Desc": "Loss Function",
+ "Aliases": [
+ "loss"
+ ],
"Required": false,
- "SortOrder": 113.0,
+ "SortOrder": 50.0,
"IsNullable": false,
- "Default": "Occurrence"
+ "Default": {
+ "Name": "LogLoss"
+ }
},
{
- "Name": "TextKeyValues",
- "Type": "Bool",
- "Desc": "Whether key value metadata should be text, regardless of the actual input type",
+ "Name": "L2Const",
+ "Type": "Float",
+ "Desc": "L2 regularizer constant",
"Aliases": [
- "textkv"
+ "l2"
],
"Required": false,
- "SortOrder": 114.0,
+ "SortOrder": 50.0,
"IsNullable": false,
- "Default": false
- }
- ],
- "Outputs": [
- {
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
+ "Default": 1E-06,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 1E-07,
+ 5E-07,
+ 1E-06,
+ 5E-06,
+ 1E-05
+ ]
+ }
},
{
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
+ "Name": "NumThreads",
+ "Type": "Int",
+ "Desc": "Degree of lock-free parallelism. Defaults to automatic depending on data sparseness. Determinism not guaranteed.",
+ "Aliases": [
+ "nt",
+ "t",
+ "threads"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "ConvergenceTolerance",
+ "Type": "Float",
+ "Desc": "Exponential moving averaged improvement tolerance for convergence",
+ "Aliases": [
+ "tol"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.0001,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.01,
+ 0.001,
+ 0.0001,
+ 1E-05
+ ]
+ }
+ },
+ {
+ "Name": "MaxIterations",
+ "Type": "Int",
+ "Desc": "Maximum number of iterations; set to 1 to simulate online learning.",
+ "Aliases": [
+ "iter"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 20,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 1,
+ 5,
+ 10,
+ 20
+ ]
+ }
+ },
+ {
+ "Name": "InitLearningRate",
+ "Type": "Float",
+ "Desc": "Initial learning rate (only used by SGD)",
+ "Aliases": [
+ "ilr",
+ "lr"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.01
+ },
+ {
+ "Name": "Shuffle",
+ "Type": "Bool",
+ "Desc": "Shuffle data every epoch?",
+ "Aliases": [
+ "shuf"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ false,
+ true
+ ]
+ }
+ },
+ {
+ "Name": "PositiveInstanceWeight",
+ "Type": "Float",
+ "Desc": "Apply weight to the positive class, for imbalanced data",
+ "Aliases": [
+ "piw"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1.0
+ },
+ {
+ "Name": "CheckFrequency",
+ "Type": "Int",
+ "Desc": "Convergence check frequency (in terms of number of iterations). Default equals number of threads",
+ "Aliases": [
+ "checkFreq"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Calibrator",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "CalibratorTrainer"
+ },
+ "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": {
+ "Name": "PlattCalibrator"
+ }
+ },
+ {
+ "Name": "MaxCalibrationExamples",
+ "Type": "Int",
+ "Desc": "The maximum number of examples to use when training the calibrator",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1000000
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "PredictorModel",
+ "Type": "PredictorModel",
+ "Desc": "The trained model"
+ }
+ ],
"InputKind": [
- "ITransformInput"
+ "ITrainerInputWithWeight",
+ "ITrainerInputWithLabel",
+ "ITrainerInput"
],
"OutputKind": [
- "ITransformOutput"
+ "IBinaryClassificationOutput",
+ "ITrainerOutput"
]
},
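By contrast with the SDCA entries, the Hogwild SGD entry above fixes L2Const at 1e-06 rather than inferring it, caps MaxIterations at a default of 20, and adds an SGD-only InitLearningRate. The hand-written C# below mirrors those defaults to make the comparison easy to scan; it is an illustration, not the generated API.

public sealed class HogwildSgdOptionsSketch
{
    public float L2Const = 1e-06f;              // fixed default, not inferred
    public int? NumThreads = null;              // automatic; determinism not guaranteed
    public float ConvergenceTolerance = 1e-04f; // exponential-moving-average tolerance
    public int MaxIterations = 20;              // 1 simulates online learning
    public float InitLearningRate = 0.01f;      // used by SGD only
    public bool Shuffle = true;
    public float PositiveInstanceWeight = 1.0f;
    public int? CheckFrequency = null;          // default: the number of threads
    public int MaxCalibrationExamples = 1000000;
}
// end of sketch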
{
- "Name": "Transforms.FeatureCombiner",
- "Desc": "Combines all the features into one feature column.",
- "FriendlyName": "Feature Combiner",
- "ShortName": "fc",
+ "Name": "Transforms.ApproximateBootstrapSampler",
+ "Desc": "Approximate bootstrap sampling.",
+ "FriendlyName": "Bootstrap Sample Transform",
+ "ShortName": "BootstrapSample",
"Inputs": [
{
"Name": "Data",
@@ -15297,16 +15530,49 @@
"IsNullable": false
},
{
- "Name": "Features",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "Features",
+ "Name": "Complement",
+ "Type": "Bool",
+ "Desc": "Whether this is the out-of-bag sample, that is, all those rows that are not selected by the transform.",
+ "Aliases": [
+ "comp"
+ ],
"Required": false,
- "SortOrder": 2.0,
+ "SortOrder": 150.0,
"IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "Seed",
+ "Type": "UInt",
+          "Desc": "The random seed. If unspecified, the random state will instead be derived from the environment.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
"Default": null
+ },
+ {
+ "Name": "ShuffleInput",
+ "Type": "Bool",
+ "Desc": "Whether we should attempt to shuffle the source data. By default on, but can be turned off for efficiency.",
+ "Aliases": [
+ "si"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true
+ },
+ {
+ "Name": "PoolSize",
+ "Type": "Int",
+ "Desc": "When shuffling the output, the number of output rows to keep in that pool. Note that shuffling of output is completely distinct from shuffling of input.",
+ "Aliases": [
+ "pool"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1000
}
],
"Outputs": [
@@ -15329,43 +15595,25 @@
]
},
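The ApproximateBootstrapSampler entry above exposes Complement, Seed, ShuffleInput and PoolSize. One common way to approximate bootstrap resampling over a stream is to draw a Poisson(1) count per row instead of sampling with replacement up front; whether this transform uses exactly that scheme is an assumption here, and the C# below is only a sketch of the idea, with Complement selecting the rows whose count is zero (the out-of-bag sample).

using System;

static class ApproxBootstrapSketch
{
    // Knuth's method for Poisson sampling; lambda = 1 here.
    static int SamplePoissonOne(Random rng)
    {
        double limit = Math.Exp(-1.0), p = 1.0;
        int k = 0;
        do { k++; p *= rng.NextDouble(); } while (p > limit);
        return k - 1;
    }

    static void Main()
    {
        var rng = new Random(42);   // manifest: Seed (otherwise derived from the environment)
        bool complement = false;    // manifest: Complement (emit only out-of-bag rows)

        for (int row = 0; row < 5; row++)
        {
            int count = SamplePoissonOne(rng);
            if (complement && count == 0)
                Console.WriteLine($"row {row}: out-of-bag");
            else if (!complement)
                for (int i = 0; i < count; i++)   // in-bag rows repeat 'count' times
                    Console.WriteLine($"row {row}: copy {i + 1} of {count}");
        }
    }
}
// end of sketch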
{
- "Name": "Transforms.FeatureSelectorByCount",
- "Desc": "Selects the slots for which the count of non-default values is greater than or equal to a threshold.",
- "FriendlyName": "Count Feature Selection Transform",
+ "Name": "Transforms.BinaryPredictionScoreColumnsRenamer",
+ "Desc": "For binary prediction, it renames the PredictedLabel and Score columns to include the name of the positive class.",
+ "FriendlyName": "Rename Binary Prediction Score Columns",
"ShortName": null,
"Inputs": [
{
- "Name": "Column",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "Columns to use for feature selection",
- "Aliases": [
- "col"
- ],
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
},
{
- "Name": "Count",
- "Type": "Int",
- "Desc": "If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved",
- "Aliases": [
- "c"
- ],
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false,
- "Default": 1
- },
- {
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
+ "Name": "PredictorModel",
+ "Type": "PredictorModel",
+ "Desc": "The predictor model used in scoring",
"Required": true,
- "SortOrder": 1.0,
+ "SortOrder": 2.0,
"IsNullable": false
}
],
@@ -15389,95 +15637,10 @@
]
},
{
- "Name": "Transforms.FeatureSelectorByMutualInformation",
- "Desc": "Selects the top k slots across all specified columns ordered by their mutual information with the label column.",
- "FriendlyName": "Mutual Information Feature Selection Transform",
- "ShortName": "MIFeatureSelection",
- "Inputs": [
- {
- "Name": "Column",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "Columns to use for feature selection",
- "Aliases": [
- "col"
- ],
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
- {
- "Name": "SlotsInOutput",
- "Type": "Int",
- "Desc": "The maximum number of slots to preserve in output",
- "Aliases": [
- "topk",
- "numSlotsToKeep"
- ],
- "Required": false,
- "SortOrder": 1.0,
- "IsNullable": false,
- "Default": 1000
- },
- {
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
- {
- "Name": "LabelColumn",
- "Type": "String",
- "Desc": "Column to use for labels",
- "Aliases": [
- "lab"
- ],
- "Required": false,
- "SortOrder": 4.0,
- "IsNullable": false,
- "Default": "Label"
- },
- {
- "Name": "NumBins",
- "Type": "Int",
- "Desc": "Max number of bins for R4/R8 columns, power of 2 recommended",
- "Aliases": [
- "bins"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 256
- }
- ],
- "Outputs": [
- {
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
- },
- {
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
- ]
- },
- {
- "Name": "Transforms.GlobalContrastNormalizer",
- "Desc": "Performs a global contrast normalization on input values: Y = (s * X - M) / D, where s is a scale, M is mean and D is either L2 norm or standard deviation.",
- "FriendlyName": "Global Contrast Normalization Transform",
- "ShortName": "Gcn",
+ "Name": "Transforms.BinNormalizer",
+      "Desc": "The values are assigned to equidensity bins, and each value is mapped to its bin_number/number_of_bins.",
+ "FriendlyName": "Binning Normalizer",
+ "ShortName": "Bin",
"Inputs": [
{
"Name": "Column",
@@ -15487,27 +15650,36 @@
"Kind": "Struct",
"Fields": [
{
- "Name": "UseStdDev",
- "Type": "Bool",
- "Desc": "Normalize by standard deviation rather than L2 norm",
+ "Name": "NumBins",
+ "Type": "Int",
+ "Desc": "Max number of bins, power of 2 recommended",
+ "Aliases": [
+ "bins"
+ ],
"Required": false,
"SortOrder": 150.0,
"IsNullable": true,
"Default": null
},
{
- "Name": "Scale",
- "Type": "Float",
- "Desc": "Scale features by this value",
+ "Name": "FixZero",
+ "Type": "Bool",
+ "Desc": "Whether to map zero to zero, preserving sparsity",
+ "Aliases": [
+ "zero"
+ ],
"Required": false,
"SortOrder": 150.0,
"IsNullable": true,
"Default": null
},
{
- "Name": "SubMean",
- "Type": "Bool",
- "Desc": "Subtract mean from each value before normalizing",
+ "Name": "MaxTrainingExamples",
+ "Type": "Int",
+ "Desc": "Max number of examples used to train the normalizer",
+ "Aliases": [
+ "maxtrain"
+ ],
"Required": false,
"SortOrder": 150.0,
"IsNullable": true,
@@ -15549,15 +15721,6 @@
"IsNullable": false,
"Default": null
},
- {
- "Name": "SubMean",
- "Type": "Bool",
- "Desc": "Subtract mean from each value before normalizing",
- "Required": false,
- "SortOrder": 1.0,
- "IsNullable": false,
- "Default": true
- },
{
"Name": "Data",
"Type": "DataView",
@@ -15567,25 +15730,40 @@
"IsNullable": false
},
{
- "Name": "UseStdDev",
+ "Name": "NumBins",
+ "Type": "Int",
+ "Desc": "Max number of bins, power of 2 recommended",
+ "Aliases": [
+ "bins"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1024
+ },
+ {
+ "Name": "FixZero",
"Type": "Bool",
- "Desc": "Normalize by standard deviation rather than L2 norm",
+ "Desc": "Whether to map zero to zero, preserving sparsity",
"Aliases": [
- "useStd"
+ "zero"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": false
+ "Default": true
},
{
- "Name": "Scale",
- "Type": "Float",
- "Desc": "Scale features by this value",
+ "Name": "MaxTrainingExamples",
+ "Type": "Int",
+ "Desc": "Max number of examples used to train the normalizer",
+ "Aliases": [
+ "maxtrain"
+ ],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": 1.0
+ "Default": 1000000000
}
],
"Outputs": [
@@ -15608,10 +15786,10 @@
]
},
{
- "Name": "Transforms.HashConverter",
- "Desc": "Converts column values into hashes. This transform accepts both numeric and text inputs, both single and vector-valued columns. This is a part of the Dracula transform.",
- "FriendlyName": "Hash Join Transform",
- "ShortName": "HashJoin",
+ "Name": "Transforms.CategoricalHashOneHotVectorizer",
+ "Desc": "Encodes the categorical variable with hash-based encoding",
+ "FriendlyName": "Categorical Hash Transform",
+ "ShortName": null,
"Inputs": [
{
"Name": "Column",
@@ -15621,27 +15799,29 @@
"Kind": "Struct",
"Fields": [
{
- "Name": "Join",
- "Type": "Bool",
- "Desc": "Whether the values need to be combined for a single hash",
+ "Name": "OutputKind",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Bag",
+ "Ind",
+ "Key",
+ "Bin"
+ ]
+ },
+ "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)",
+ "Aliases": [
+ "kind"
+ ],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 102.0,
"IsNullable": true,
"Default": null
},
- {
- "Name": "CustomSlotMap",
- "Type": "String",
- "Desc": "Which slots should be combined together. Example: 0,3,5;0,1;3;2,1,0. Overrides 'join'.",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
{
"Name": "HashBits",
"Type": "Int",
- "Desc": "Number of bits to hash into. Must be between 1 and 31, inclusive.",
+ "Desc": "The number of bits to hash into. Must be between 1 and 30, inclusive.",
"Aliases": [
"bits"
],
@@ -15671,6 +15851,18 @@
"IsNullable": true,
"Default": null
},
+ {
+ "Name": "InvertHash",
+ "Type": "Int",
+ "Desc": "Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.",
+ "Aliases": [
+ "ih"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
{
"Name": "Name",
"Type": "String",
@@ -15698,7 +15890,7 @@
]
}
},
- "Desc": "New column definition(s) (optional form: name:src)",
+ "Desc": "New column definition(s) (optional form: name:hashBits:src)",
"Aliases": [
"col"
],
@@ -15717,23 +15909,34 @@
{
"Name": "HashBits",
"Type": "Int",
- "Desc": "Number of bits to hash into. Must be between 1 and 31, inclusive.",
+ "Desc": "Number of bits to hash into. Must be between 1 and 30, inclusive.",
"Aliases": [
"bits"
],
"Required": false,
"SortOrder": 2.0,
"IsNullable": false,
- "Default": 31
+ "Default": 16
},
{
- "Name": "Join",
- "Type": "Bool",
- "Desc": "Whether the values need to be combined for a single hash",
+ "Name": "OutputKind",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Bag",
+ "Ind",
+ "Key",
+ "Bin"
+ ]
+ },
+ "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)",
+ "Aliases": [
+ "kind"
+ ],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 102.0,
"IsNullable": false,
- "Default": true
+ "Default": "Bag"
},
{
"Name": "Seed",
@@ -15755,6 +15958,18 @@
"SortOrder": 150.0,
"IsNullable": false,
"Default": true
+ },
+ {
+ "Name": "InvertHash",
+ "Type": "Int",
+ "Desc": "Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.",
+ "Aliases": [
+ "ih"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0
}
],
"Outputs": [
@@ -15777,9 +15992,9 @@
]
},
{
- "Name": "Transforms.KeyToTextConverter",
- "Desc": "KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the KeyValues metadata.",
- "FriendlyName": "Key To Value Transform",
+ "Name": "Transforms.CategoricalOneHotVectorizer",
+ "Desc": "Encodes the categorical variable with one-hot encoding based on term dictionary",
+ "FriendlyName": "Categorical Transform",
"ShortName": null,
"Inputs": [
{
@@ -15790,11 +16005,82 @@
"Kind": "Struct",
"Fields": [
{
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
+ "Name": "OutputKind",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Bag",
+ "Ind",
+ "Key",
+ "Bin"
+ ]
+ },
+ "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector",
"Aliases": [
- "name"
+ "kind"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "MaxNumTerms",
+ "Type": "Int",
+ "Desc": "Maximum number of terms to keep when auto-training",
+ "Aliases": [
+ "max"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Term",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "List of terms",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Sort",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Occurrence",
+ "Value"
+ ]
+ },
+ "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "TextKeyValues",
+ "Type": "Bool",
+ "Desc": "Whether key value metadata should be text, regardless of the actual input type",
+ "Aliases": [
+ "textkv"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
],
"Required": false,
"SortOrder": 150.0,
@@ -15831,55 +16117,75 @@
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
- }
- ],
- "Outputs": [
+ },
{
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
+ "Name": "MaxNumTerms",
+ "Type": "Int",
+ "Desc": "Maximum number of terms to keep per column when auto-training",
+ "Aliases": [
+ "max"
+ ],
+ "Required": false,
+ "SortOrder": 5.0,
+ "IsNullable": false,
+ "Default": 1000000
},
{
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
- ]
- },
- {
- "Name": "Transforms.LabelColumnKeyBooleanConverter",
- "Desc": "Transforms the label to either key or bool (if needed) to make it suitable for classification.",
- "FriendlyName": "Prepare Classification Label",
- "ShortName": null,
- "Inputs": [
+ "Name": "OutputKind",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Bag",
+ "Ind",
+ "Key",
+ "Bin"
+ ]
+ },
+ "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)",
+ "Aliases": [
+ "kind"
+ ],
+ "Required": false,
+ "SortOrder": 102.0,
+ "IsNullable": false,
+ "Default": "Ind"
+ },
{
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Name": "Term",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "List of terms",
+ "Required": false,
+ "SortOrder": 106.0,
+ "IsNullable": false,
+ "Default": null
},
{
- "Name": "LabelColumn",
- "Type": "String",
- "Desc": "The label column",
- "Required": true,
- "SortOrder": 2.0,
- "IsNullable": false
+ "Name": "Sort",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Occurrence",
+ "Value"
+ ]
+ },
+ "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').",
+ "Required": false,
+ "SortOrder": 113.0,
+ "IsNullable": false,
+ "Default": "Occurrence"
},
{
"Name": "TextKeyValues",
"Type": "Bool",
- "Desc": "Convert the key values to text",
+ "Desc": "Whether key value metadata should be text, regardless of the actual input type",
+ "Aliases": [
+ "textkv"
+ ],
"Required": false,
- "SortOrder": 3.0,
+ "SortOrder": 114.0,
"IsNullable": false,
"Default": true
}
@@ -15904,10 +16210,10 @@
]
},
{
- "Name": "Transforms.LabelIndicator",
- "Desc": "Label remapper used by OVA",
- "FriendlyName": "LabelIndicator",
- "ShortName": "LabelIndictator",
+ "Name": "Transforms.CharacterTokenizer",
+ "Desc": "Character-oriented tokenizer where text is considered a sequence of characters.",
+ "FriendlyName": "Character Tokenizer Transform",
+ "ShortName": "CharToken",
"Inputs": [
{
"Name": "Column",
@@ -15916,18 +16222,6 @@
"ItemType": {
"Kind": "Struct",
"Fields": [
- {
- "Name": "ClassIndex",
- "Type": "Int",
- "Desc": "The positive example class for binary classification.",
- "Aliases": [
- "index"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
{
"Name": "Name",
"Type": "String",
@@ -15959,10 +16253,9 @@
"Aliases": [
"col"
],
- "Required": false,
+ "Required": true,
"SortOrder": 1.0,
- "IsNullable": false,
- "Default": null
+ "IsNullable": false
},
{
"Name": "Data",
@@ -15973,16 +16266,16 @@
"IsNullable": false
},
{
- "Name": "ClassIndex",
- "Type": "Int",
- "Desc": "Label of the positive class.",
+ "Name": "UseMarkerChars",
+ "Type": "Bool",
+ "Desc": "Whether to mark the beginning/end of each row/slot with start of text character (0x02)/end of text character (0x03)",
"Aliases": [
- "index"
+ "mark"
],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 2.0,
"IsNullable": false,
- "Default": 0
+ "Default": true
}
],
"Outputs": [
@@ -16005,25 +16298,62 @@
]
},
{
- "Name": "Transforms.LabelToFloatConverter",
- "Desc": "Transforms the label to float to make it suitable for regression.",
- "FriendlyName": "Prepare Regression Label",
- "ShortName": null,
+ "Name": "Transforms.ColumnConcatenator",
+ "Desc": "Concatenates two columns of the same item type.",
+ "FriendlyName": "Concat Transform",
+ "ShortName": "Concat",
"Inputs": [
{
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition(s) (optional form: name:srcs)",
+ "Aliases": [
+ "col"
+ ],
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
},
{
- "Name": "LabelColumn",
- "Type": "String",
- "Desc": "The label column",
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
"Required": true,
- "SortOrder": 2.0,
+ "SortOrder": 1.0,
"IsNullable": false
}
],
@@ -16047,19 +16377,11 @@
]
},
{
- "Name": "Transforms.LightLda",
- "Desc": "The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation.",
- "FriendlyName": "Latent Dirichlet Allocation Transform",
- "ShortName": "LightLda",
+ "Name": "Transforms.ColumnCopier",
+ "Desc": "Duplicates columns from the dataset",
+ "FriendlyName": "Copy Columns Transform",
+ "ShortName": "Copy",
"Inputs": [
- {
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
{
"Name": "Column",
"Type": {
@@ -16067,126 +16389,6 @@
"ItemType": {
"Kind": "Struct",
"Fields": [
- {
- "Name": "NumTopic",
- "Type": "Int",
- "Desc": "The number of topics in the LDA",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "AlphaSum",
- "Type": "Float",
- "Desc": "Dirichlet prior on document-topic vectors",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Beta",
- "Type": "Float",
- "Desc": "Dirichlet prior on vocab-topic vectors",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Mhstep",
- "Type": "Int",
- "Desc": "Number of Metropolis Hasting step",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "NumIterations",
- "Type": "Int",
- "Desc": "Number of iterations",
- "Aliases": [
- "iter"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "LikelihoodInterval",
- "Type": "Int",
- "Desc": "Compute log likelihood over local dataset on this iteration interval",
- "Aliases": [
- "llInterval"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "NumThreads",
- "Type": "Int",
- "Desc": "The number of training threads",
- "Aliases": [
- "t"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "NumMaxDocToken",
- "Type": "Int",
- "Desc": "The threshold of maximum count of tokens per doc",
- "Aliases": [
- "maxNumToken"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "NumSummaryTermPerTopic",
- "Type": "Int",
- "Desc": "The number of words to summarize the topic",
- "Aliases": [
- "ns"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "NumBurninIterations",
- "Type": "Int",
- "Desc": "The number of burn-in iterations",
- "Aliases": [
- "burninIter"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": 10
- },
- {
- "Name": "ResetRandomGenerator",
- "Type": "Bool",
- "Desc": "Reset the random number generator for each document",
- "Aliases": [
- "reset"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
{
"Name": "Name",
"Type": "String",
@@ -16214,199 +16416,118 @@
]
}
},
- "Desc": "New column definition(s) (optional form: name:srcs)",
+ "Desc": "New column definition(s) (optional form: name:src)",
"Aliases": [
"col"
],
"Required": true,
- "SortOrder": 49.0,
+ "SortOrder": 1.0,
"IsNullable": false
},
{
- "Name": "NumTopic",
- "Type": "Int",
- "Desc": "The number of topics in the LDA",
- "Required": false,
- "SortOrder": 50.0,
- "IsNullable": false,
- "Default": 100,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- 20,
- 40,
- 100,
- 200
- ]
- }
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
},
{
- "Name": "NumMaxDocToken",
- "Type": "Int",
- "Desc": "The threshold of maximum count of tokens per doc",
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.ColumnDropper",
+ "Desc": "Drops columns from the dataset",
+ "FriendlyName": "Drop Columns Transform",
+ "ShortName": "Drop",
+ "Inputs": [
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "Column name to drop",
"Aliases": [
- "maxNumToken"
+ "col"
],
- "Required": false,
- "SortOrder": 50.0,
- "IsNullable": false,
- "Default": 512
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
},
{
- "Name": "NumThreads",
- "Type": "Int",
- "Desc": "The number of training threads. Default value depends on number of logical processors.",
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.ColumnSelector",
+ "Desc": "Selects a set of columns, dropping all others",
+ "FriendlyName": "Select Columns",
+ "ShortName": null,
+ "Inputs": [
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "Column name to keep",
"Aliases": [
- "t"
+ "col"
],
"Required": false,
- "SortOrder": 50.0,
- "IsNullable": true,
+ "SortOrder": 1.0,
+ "IsNullable": false,
"Default": null
},
{
- "Name": "AlphaSum",
- "Type": "Float",
- "Desc": "Dirichlet prior on document-topic vectors",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 100.0,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- 1,
- 10,
- 100,
- 200
- ]
- }
- },
- {
- "Name": "Beta",
- "Type": "Float",
- "Desc": "Dirichlet prior on vocab-topic vectors",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 0.01,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- 0.01,
- 0.015,
- 0.07,
- 0.02
- ]
- }
- },
- {
- "Name": "Mhstep",
- "Type": "Int",
- "Desc": "Number of Metropolis Hasting step",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 4,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- 2,
- 4,
- 8,
- 16
- ]
- }
- },
- {
- "Name": "NumIterations",
- "Type": "Int",
- "Desc": "Number of iterations",
- "Aliases": [
- "iter"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 200,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- 100,
- 200,
- 300,
- 400
- ]
- }
- },
- {
- "Name": "LikelihoodInterval",
- "Type": "Int",
- "Desc": "Compute log likelihood over local dataset on this iteration interval",
- "Aliases": [
- "llInterval"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 5
- },
- {
- "Name": "NumSummaryTermPerTopic",
- "Type": "Int",
- "Desc": "The number of words to summarize the topic",
- "Aliases": [
- "ns"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 10
- },
- {
- "Name": "NumBurninIterations",
- "Type": "Int",
- "Desc": "The number of burn-in iterations",
- "Aliases": [
- "burninIter"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 10,
- "SweepRange": {
- "RangeType": "Discrete",
- "Values": [
- 10,
- 20,
- 30,
- 40
- ]
- }
- },
- {
- "Name": "ResetRandomGenerator",
- "Type": "Bool",
- "Desc": "Reset the random number generator for each document",
- "Aliases": [
- "reset"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": false
- },
- {
- "Name": "OutputTopicWordSummary",
- "Type": "Bool",
- "Desc": "Whether to output the topic-word summary in text format",
- "Aliases": [
- "summary"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": false
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
}
],
"Outputs": [
@@ -16429,10 +16550,10 @@
]
},
{
- "Name": "Transforms.LogMeanVarianceNormalizer",
- "Desc": "Normalizes the data based on the computed mean and variance of the logarithm of the data.",
- "FriendlyName": "LogMeanVar Normalizer",
- "ShortName": "LogMeanVar",
+ "Name": "Transforms.ColumnTypeConverter",
+ "Desc": "Converts a column to a different type, using standard conversions.",
+ "FriendlyName": "Convert Transform",
+ "ShortName": "Convert",
"Inputs": [
{
"Name": "Column",
@@ -16442,17 +16563,57 @@
"Kind": "Struct",
"Fields": [
{
- "Name": "MaxTrainingExamples",
- "Type": "Int",
- "Desc": "Max number of examples used to train the normalizer",
+ "Name": "ResultType",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "I1",
+ "U1",
+ "I2",
+ "U2",
+ "I4",
+ "U4",
+ "I8",
+ "U8",
+ "R4",
+ "Num",
+ "R8",
+ "TX",
+ "Text",
+ "TXT",
+ "BL",
+ "Bool",
+ "TimeSpan",
+ "TS",
+ "DT",
+ "DateTime",
+ "DZ",
+ "DateTimeZone",
+ "UG",
+ "U16"
+ ]
+ },
+ "Desc": "The result type",
"Aliases": [
- "maxtrain"
+ "type"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": true,
"Default": null
},
+ {
+ "Name": "Range",
+ "Type": "String",
+ "Desc": "For a key column, this defines the range of values",
+ "Aliases": [
+ "key"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
{
"Name": "Name",
"Type": "String",
@@ -16480,14 +16641,13 @@
]
}
},
- "Desc": "New column definition(s) (optional form: name:src)",
+ "Desc": "New column definition(s) (optional form: name:type:src)",
"Aliases": [
"col"
],
- "Required": false,
+ "Required": true,
"SortOrder": 1.0,
- "IsNullable": false,
- "Default": null
+ "IsNullable": false
},
{
"Name": "Data",
@@ -16498,28 +16658,56 @@
"IsNullable": false
},
{
- "Name": "UseCdf",
- "Type": "Bool",
- "Desc": "Whether to use CDF as the output",
+ "Name": "ResultType",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "I1",
+ "U1",
+ "I2",
+ "U2",
+ "I4",
+ "U4",
+ "I8",
+ "U8",
+ "R4",
+ "Num",
+ "R8",
+ "TX",
+ "Text",
+ "TXT",
+ "BL",
+ "Bool",
+ "TimeSpan",
+ "TS",
+ "DT",
+ "DateTime",
+ "DZ",
+ "DateTimeZone",
+ "UG",
+ "U16"
+ ]
+ },
+ "Desc": "The result type",
"Aliases": [
- "cdf"
+ "type"
],
"Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": true
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null
},
{
- "Name": "MaxTrainingExamples",
- "Type": "Int",
- "Desc": "Max number of examples used to train the normalizer",
- "Aliases": [
- "maxtrain"
+ "Name": "Range",
+ "Type": "String",
+ "Desc": "For a key column, this defines the range of values",
+ "Aliases": [
+ "key"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": 1000000000
+ "Default": null
}
],
"Outputs": [
@@ -16542,101 +16730,25 @@
]
},
{
- "Name": "Transforms.LpNormalizer",
- "Desc": "Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D is either L2 norm, L1 norm or LInf norm.",
- "FriendlyName": "Lp-Norm Normalizer",
- "ShortName": "lpnorm",
+ "Name": "Transforms.CombinerByContiguousGroupId",
+ "Desc": "Groups values of a scalar column into a vector, by a contiguous group ID",
+ "FriendlyName": "Group Transform",
+ "ShortName": "Group",
"Inputs": [
{
- "Name": "Column",
+ "Name": "GroupKey",
"Type": {
"Kind": "Array",
- "ItemType": {
- "Kind": "Struct",
- "Fields": [
- {
- "Name": "NormKind",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "L2Norm",
- "StdDev",
- "L1Norm",
- "LInf"
- ]
- },
- "Desc": "The norm to use to normalize each sample",
- "Aliases": [
- "norm"
- ],
- "Required": false,
- "SortOrder": 1.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "SubMean",
- "Type": "Bool",
- "Desc": "Subtract mean from each value before normalizing",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
- "Aliases": [
- "name"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Source",
- "Type": "String",
- "Desc": "Name of the source column",
- "Aliases": [
- "src"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- }
- ]
- }
- },
- "Desc": "New column definition(s) (optional form: name:src)",
- "Aliases": [
- "col"
- ],
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
- {
- "Name": "NormKind",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "L2Norm",
- "StdDev",
- "L1Norm",
- "LInf"
- ]
+ "ItemType": "String"
},
- "Desc": "The norm to use to normalize each sample",
+ "Desc": "Columns to group by",
"Aliases": [
- "norm"
+ "g"
],
"Required": false,
"SortOrder": 1.0,
"IsNullable": false,
- "Default": "L2Norm"
+ "Default": null
},
{
"Name": "Data",
@@ -16647,13 +16759,18 @@
"IsNullable": false
},
{
- "Name": "SubMean",
- "Type": "Bool",
- "Desc": "Subtract mean from each value before normalizing",
- "Required": false,
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "Columns to group together",
+ "Aliases": [
+ "col"
+ ],
+ "Required": true,
"SortOrder": 2.0,
- "IsNullable": false,
- "Default": false
+ "IsNullable": false
}
],
"Outputs": [
@@ -16676,44 +16793,10 @@
]
},
{
- "Name": "Transforms.ManyHeterogeneousModelCombiner",
- "Desc": "Combines a sequence of TransformModels and a PredictorModel into a single PredictorModel.",
- "FriendlyName": null,
+ "Name": "Transforms.ConditionalNormalizer",
+ "Desc": "Normalize the columns only if needed",
+ "FriendlyName": "Normalize If Needed",
"ShortName": null,
- "Inputs": [
- {
- "Name": "TransformModels",
- "Type": {
- "Kind": "Array",
- "ItemType": "TransformModel"
- },
- "Desc": "Transform model",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
- {
- "Name": "PredictorModel",
- "Type": "PredictorModel",
- "Desc": "Predictor model",
- "Required": true,
- "SortOrder": 2.0,
- "IsNullable": false
- }
- ],
- "Outputs": [
- {
- "Name": "PredictorModel",
- "Type": "PredictorModel",
- "Desc": "Predictor model"
- }
- ]
- },
- {
- "Name": "Transforms.MeanVarianceNormalizer",
- "Desc": "Normalizes the data based on the computed mean and variance of the data.",
- "FriendlyName": "MeanVar Normalizer",
- "ShortName": "MeanVar",
"Inputs": [
{
"Name": "Column",
@@ -16789,18 +16872,6 @@
"SortOrder": 1.0,
"IsNullable": false
},
- {
- "Name": "UseCdf",
- "Type": "Bool",
- "Desc": "Whether to use CDF as the output",
- "Aliases": [
- "cdf"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": false
- },
{
"Name": "FixZero",
"Type": "Bool",
@@ -16840,83 +16911,14 @@
],
"InputKind": [
"ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
]
},
{
- "Name": "Transforms.MinMaxNormalizer",
- "Desc": "Normalizes the data based on the observed minimum and maximum values of the data.",
- "FriendlyName": "Min-Max Normalizer",
- "ShortName": "MinMax",
+ "Name": "Transforms.DataCache",
+ "Desc": "Caches using the specified cache option.",
+ "FriendlyName": "Cache Data",
+ "ShortName": null,
"Inputs": [
- {
- "Name": "Column",
- "Type": {
- "Kind": "Array",
- "ItemType": {
- "Kind": "Struct",
- "Fields": [
- {
- "Name": "FixZero",
- "Type": "Bool",
- "Desc": "Whether to map zero to zero, preserving sparsity",
- "Aliases": [
- "zero"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "MaxTrainingExamples",
- "Type": "Int",
- "Desc": "Max number of examples used to train the normalizer",
- "Aliases": [
- "maxtrain"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
- "Aliases": [
- "name"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Source",
- "Type": "String",
- "Desc": "Name of the source column",
- "Aliases": [
- "src"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- }
- ]
- }
- },
- "Desc": "New column definition(s) (optional form: name:src)",
- "Aliases": [
- "col"
- ],
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
{
"Name": "Data",
"Type": "DataView",
@@ -16926,54 +16928,118 @@
"IsNullable": false
},
{
- "Name": "FixZero",
- "Type": "Bool",
- "Desc": "Whether to map zero to zero, preserving sparsity",
- "Aliases": [
- "zero"
- ],
- "Required": false,
- "SortOrder": 150.0,
+ "Name": "Caching",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Memory",
+ "Disk"
+ ]
+ },
+ "Desc": "Caching strategy",
+ "Required": true,
+ "SortOrder": 2.0,
"IsNullable": false,
- "Default": true
+ "Default": "Memory"
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Dataset"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ]
+ },
+ {
+ "Name": "Transforms.DatasetScorer",
+ "Desc": "Score a dataset with a predictor model",
+ "FriendlyName": null,
+ "ShortName": null,
+ "Inputs": [
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "The dataset to be scored",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
},
{
- "Name": "MaxTrainingExamples",
- "Type": "Int",
- "Desc": "Max number of examples used to train the normalizer",
- "Aliases": [
- "maxtrain"
- ],
+ "Name": "PredictorModel",
+ "Type": "PredictorModel",
+ "Desc": "The predictor model to apply to data",
+ "Required": true,
+ "SortOrder": 2.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Suffix",
+ "Type": "String",
+ "Desc": "Suffix to append to the score columns",
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 3.0,
"IsNullable": false,
- "Default": 1000000000
+ "Default": null
}
],
"Outputs": [
{
- "Name": "OutputData",
+ "Name": "ScoredData",
"Type": "DataView",
- "Desc": "Transformed dataset"
+ "Desc": "The scored dataset"
},
{
- "Name": "Model",
+ "Name": "ScoringTransform",
"Type": "TransformModel",
- "Desc": "Transform model"
+ "Desc": "The scoring transform"
+ }
+ ]
+ },
+ {
+ "Name": "Transforms.DatasetTransformScorer",
+ "Desc": "Score a dataset with a transform model",
+ "FriendlyName": null,
+ "ShortName": null,
+ "Inputs": [
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "The dataset to be scored",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "TransformModel",
+ "Type": "TransformModel",
+ "Desc": "The transform model to apply to data",
+ "Required": true,
+ "SortOrder": 2.0,
+ "IsNullable": false
}
],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
+ "Outputs": [
+ {
+ "Name": "ScoredData",
+ "Type": "DataView",
+ "Desc": "The scored dataset"
+ },
+ {
+ "Name": "ScoringTransform",
+ "Type": "TransformModel",
+ "Desc": "The scoring transform"
+ }
]
},
{
- "Name": "Transforms.MissingValueHandler",
- "Desc": "Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). An indicator column can optionally be concatenated, if theinput column type is numeric.",
- "FriendlyName": "NA Handle Transform",
- "ShortName": "NAHandle",
+ "Name": "Transforms.Dictionarizer",
+ "Desc": "Converts input values (words, numbers, etc.) to index in a dictionary.",
+ "FriendlyName": "Term Transform",
+ "ShortName": "TermTransform",
"Inputs": [
{
"Name": "Column",
@@ -16983,40 +17049,50 @@
"Kind": "Struct",
"Fields": [
{
- "Name": "Kind",
+ "Name": "MaxNumTerms",
+ "Type": "Int",
+ "Desc": "Maximum number of terms to keep when auto-training",
+ "Aliases": [
+ "max"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Term",
"Type": {
- "Kind": "Enum",
- "Values": [
- "DefaultValue",
- "Mean",
- "Minimum",
- "Maximum"
- ]
+ "Kind": "Array",
+ "ItemType": "String"
},
- "Desc": "The replacement method to utilize",
+ "Desc": "List of terms",
"Required": false,
"SortOrder": 150.0,
- "IsNullable": true,
+ "IsNullable": false,
"Default": null
},
{
- "Name": "ImputeBySlot",
- "Type": "Bool",
- "Desc": "Whether to impute values by slot",
- "Aliases": [
- "slot"
- ],
+ "Name": "Sort",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Occurrence",
+ "Value"
+ ]
+ },
+ "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').",
"Required": false,
"SortOrder": 150.0,
"IsNullable": true,
"Default": null
},
{
- "Name": "ConcatIndicator",
+ "Name": "TextKeyValues",
"Type": "Bool",
- "Desc": "Whether or not to concatenate an indicator vector column to the value column",
+ "Desc": "Whether key value metadata should be text, regardless of the actual input type",
"Aliases": [
- "ind"
+ "textkv"
],
"Required": false,
"SortOrder": 150.0,
@@ -17050,13 +17126,14 @@
]
}
},
- "Desc": "New column definition(s) (optional form: name:rep:src)",
+ "Desc": "New column definition(s) (optional form: name:src)",
"Aliases": [
"col"
],
- "Required": true,
+ "Required": false,
"SortOrder": 1.0,
- "IsNullable": false
+ "IsNullable": false,
+ "Default": null
},
{
"Name": "Data",
@@ -17067,48 +17144,55 @@
"IsNullable": false
},
{
- "Name": "ReplaceWith",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "DefaultValue",
- "Mean",
- "Minimum",
- "Maximum"
- ]
- },
- "Desc": "The replacement method to utilize",
+ "Name": "MaxNumTerms",
+ "Type": "Int",
+ "Desc": "Maximum number of terms to keep per column when auto-training",
"Aliases": [
- "kind"
+ "max"
],
"Required": false,
- "SortOrder": 2.0,
+ "SortOrder": 5.0,
"IsNullable": false,
- "Default": "Def"
+ "Default": 1000000
},
{
- "Name": "ImputeBySlot",
- "Type": "Bool",
- "Desc": "Whether to impute values by slot",
- "Aliases": [
- "slot"
- ],
+ "Name": "Term",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "List of terms",
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 106.0,
"IsNullable": false,
- "Default": true
+ "Default": null
},
{
- "Name": "Concat",
+ "Name": "Sort",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Occurrence",
+ "Value"
+ ]
+ },
+ "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').",
+ "Required": false,
+ "SortOrder": 113.0,
+ "IsNullable": false,
+ "Default": "Occurrence"
+ },
+ {
+ "Name": "TextKeyValues",
"Type": "Bool",
- "Desc": "Whether or not to concatenate an indicator vector column to the value column",
+ "Desc": "Whether key value metadata should be text, regardless of the actual input type",
"Aliases": [
- "ind"
+ "textkv"
],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 114.0,
"IsNullable": false,
- "Default": true
+ "Default": false
}
],
"Outputs": [
@@ -17131,60 +17215,30 @@
]
},
{
- "Name": "Transforms.MissingValueIndicator",
- "Desc": "Create a boolean output column with the same number of slots as the input column, where the output value is true if the value in the input column is missing.",
- "FriendlyName": "NA Indicator Transform",
- "ShortName": "NAInd",
+ "Name": "Transforms.FeatureCombiner",
+ "Desc": "Combines all the features into one feature column.",
+ "FriendlyName": "Feature Combiner",
+ "ShortName": "fc",
"Inputs": [
{
- "Name": "Column",
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Features",
"Type": {
"Kind": "Array",
- "ItemType": {
- "Kind": "Struct",
- "Fields": [
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
- "Aliases": [
- "name"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Source",
- "Type": "String",
- "Desc": "Name of the source column",
- "Aliases": [
- "src"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- }
- ]
- }
+ "ItemType": "String"
},
- "Desc": "New column definition(s) (optional form: name:src)",
- "Aliases": [
- "col"
- ],
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
- {
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Desc": "Features",
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": false,
+ "Default": null
}
],
"Outputs": [
@@ -17207,46 +17261,18 @@
]
},
{
- "Name": "Transforms.MissingValuesDropper",
- "Desc": "Removes NAs from vector columns.",
- "FriendlyName": "NA Drop Transform",
- "ShortName": "NADrop",
+ "Name": "Transforms.FeatureSelectorByCount",
+ "Desc": "Selects the slots for which the count of non-default values is greater than or equal to a threshold.",
+ "FriendlyName": "Count Feature Selection Transform",
+ "ShortName": null,
"Inputs": [
{
"Name": "Column",
"Type": {
"Kind": "Array",
- "ItemType": {
- "Kind": "Struct",
- "Fields": [
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
- "Aliases": [
- "name"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Source",
- "Type": "String",
- "Desc": "Name of the source column",
- "Aliases": [
- "src"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- }
- ]
- }
+ "ItemType": "String"
},
- "Desc": "Columns to drop the NAs for",
+ "Desc": "Columns to use for feature selection",
"Aliases": [
"col"
],
@@ -17254,6 +17280,18 @@
"SortOrder": 1.0,
"IsNullable": false
},
+ {
+ "Name": "Count",
+ "Type": "Int",
+ "Desc": "If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved",
+ "Aliases": [
+ "c"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": 1
+ },
{
"Name": "Data",
"Type": "DataView",
@@ -17283,10 +17321,10 @@
]
},
{
- "Name": "Transforms.MissingValuesRowDropper",
- "Desc": "Filters out rows that contain missing values.",
- "FriendlyName": "NA Filter",
- "ShortName": "NAFilter",
+ "Name": "Transforms.FeatureSelectorByMutualInformation",
+ "Desc": "Selects the top k slots across all specified columns ordered by their mutual information with the label column.",
+ "FriendlyName": "Mutual Information Feature Selection Transform",
+ "ShortName": "MIFeatureSelection",
"Inputs": [
{
"Name": "Column",
@@ -17294,7 +17332,7 @@
"Kind": "Array",
"ItemType": "String"
},
- "Desc": "Column",
+ "Desc": "Columns to use for feature selection",
"Aliases": [
"col"
],
@@ -17302,6 +17340,19 @@
"SortOrder": 1.0,
"IsNullable": false
},
+ {
+ "Name": "SlotsInOutput",
+ "Type": "Int",
+ "Desc": "The maximum number of slots to preserve in output",
+ "Aliases": [
+ "topk",
+ "numSlotsToKeep"
+ ],
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": 1000
+ },
{
"Name": "Data",
"Type": "DataView",
@@ -17311,13 +17362,28 @@
"IsNullable": false
},
{
- "Name": "Complement",
- "Type": "Bool",
- "Desc": "If true, keep only rows that contain NA values, and filter the rest.",
+ "Name": "LabelColumn",
+ "Type": "String",
+ "Desc": "Column to use for labels",
+ "Aliases": [
+ "lab"
+ ],
+ "Required": false,
+ "SortOrder": 4.0,
+ "IsNullable": false,
+ "Default": "Label"
+ },
+ {
+ "Name": "NumBins",
+ "Type": "Int",
+ "Desc": "Max number of bins for R4/R8 columns, power of 2 recommended",
+ "Aliases": [
+ "bins"
+ ],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": false
+ "Default": 256
}
],
"Outputs": [
@@ -17340,10 +17406,10 @@
]
},
{
- "Name": "Transforms.MissingValueSubstitutor",
- "Desc": "Create an output column of the same type and size of the input column, where missing values are replaced with either the default value or the mean/min/max value (for non-text columns only).",
- "FriendlyName": "NA Replace Transform",
- "ShortName": "NARep",
+ "Name": "Transforms.GlobalContrastNormalizer",
+ "Desc": "Performs a global contrast normalization on input values: Y = (s * X - M) / D, where s is a scale, M is mean and D is either L2 norm or standard deviation.",
+ "FriendlyName": "Global Contrast Normalization Transform",
+ "ShortName": "Gcn",
"Inputs": [
{
"Name": "Column",
@@ -17353,39 +17419,27 @@
"Kind": "Struct",
"Fields": [
{
- "Name": "ReplacementString",
- "Type": "String",
- "Desc": "Replacement value for NAs (uses default value if not given)",
- "Aliases": [
- "rep"
- ],
+ "Name": "UseStdDev",
+ "Type": "Bool",
+ "Desc": "Normalize by standard deviation rather than L2 norm",
"Required": false,
"SortOrder": 150.0,
- "IsNullable": false,
+ "IsNullable": true,
"Default": null
},
{
- "Name": "Kind",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "DefaultValue",
- "Mean",
- "Minimum",
- "Maximum",
- "SpecifiedValue"
- ]
- },
- "Desc": "The replacement method to utilize",
+ "Name": "Scale",
+ "Type": "Float",
+ "Desc": "Scale features by this value",
"Required": false,
"SortOrder": 150.0,
"IsNullable": true,
"Default": null
},
{
- "Name": "Slot",
+ "Name": "SubMean",
"Type": "Bool",
- "Desc": "Whether to impute values by slot",
+ "Desc": "Subtract mean from each value before normalizing",
"Required": false,
"SortOrder": 150.0,
"IsNullable": true,
@@ -17418,13 +17472,23 @@
]
}
},
- "Desc": "New column definition(s) (optional form: name:rep:src)",
+ "Desc": "New column definition(s) (optional form: name:src)",
"Aliases": [
"col"
],
- "Required": true,
+ "Required": false,
"SortOrder": 1.0,
- "IsNullable": false
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "SubMean",
+ "Type": "Bool",
+ "Desc": "Subtract mean from each value before normalizing",
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": true
},
{
"Name": "Data",
@@ -17435,37 +17499,25 @@
"IsNullable": false
},
{
- "Name": "ReplacementKind",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "DefaultValue",
- "Mean",
- "Minimum",
- "Maximum",
- "SpecifiedValue"
- ]
- },
- "Desc": "The replacement method to utilize",
+ "Name": "UseStdDev",
+ "Type": "Bool",
+ "Desc": "Normalize by standard deviation rather than L2 norm",
"Aliases": [
- "kind"
+ "useStd"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": "Default"
+ "Default": false
},
{
- "Name": "ImputeBySlot",
- "Type": "Bool",
- "Desc": "Whether to impute values by slot",
- "Aliases": [
- "slot"
- ],
+ "Name": "Scale",
+ "Type": "Float",
+ "Desc": "Scale features by this value",
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": true
+ "Default": 1.0
}
],
"Outputs": [
@@ -17488,75 +17540,42 @@
]
},
{
- "Name": "Transforms.ModelCombiner",
- "Desc": "Combines a sequence of TransformModels into a single model",
- "FriendlyName": null,
- "ShortName": null,
+ "Name": "Transforms.HashConverter",
+ "Desc": "Converts column values into hashes. This transform accepts both numeric and text inputs, both single and vector-valued columns. This is a part of the Dracula transform.",
+ "FriendlyName": "Hash Join Transform",
+ "ShortName": "HashJoin",
"Inputs": [
{
- "Name": "Models",
- "Type": {
- "Kind": "Array",
- "ItemType": "TransformModel"
- },
- "Desc": "Input models",
- "Required": false,
- "SortOrder": 1.0,
- "IsNullable": false,
- "Default": null
- }
- ],
- "Outputs": [
- {
- "Name": "OutputModel",
- "Type": "TransformModel",
- "Desc": "Combined model"
- }
- ]
- },
- {
- "Name": "Transforms.NGramTranslator",
- "Desc": "Produces a bag of counts of ngrams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.",
- "FriendlyName": "NGram Transform",
- "ShortName": "NgramTransform",
- "Inputs": [
- {
- "Name": "Column",
+ "Name": "Column",
"Type": {
"Kind": "Array",
"ItemType": {
"Kind": "Struct",
"Fields": [
{
- "Name": "NgramLength",
- "Type": "Int",
- "Desc": "Maximum ngram length",
- "Aliases": [
- "ngram"
- ],
+ "Name": "Join",
+ "Type": "Bool",
+ "Desc": "Whether the values need to be combined for a single hash",
"Required": false,
"SortOrder": 150.0,
"IsNullable": true,
"Default": null
},
{
- "Name": "AllLengths",
- "Type": "Bool",
- "Desc": "Whether to include all ngram lengths up to NgramLength or only NgramLength",
- "Aliases": [
- "all"
- ],
+ "Name": "CustomSlotMap",
+ "Type": "String",
+ "Desc": "Which slots should be combined together. Example: 0,3,5;0,1;3;2,1,0. Overrides 'join'.",
"Required": false,
"SortOrder": 150.0,
- "IsNullable": true,
+ "IsNullable": false,
"Default": null
},
{
- "Name": "SkipLength",
+ "Name": "HashBits",
"Type": "Int",
- "Desc": "Maximum number of tokens to skip when constructing an ngram",
+ "Desc": "Number of bits to hash into. Must be between 1 and 31, inclusive.",
"Aliases": [
- "skips"
+ "bits"
],
"Required": false,
"SortOrder": 150.0,
@@ -17564,31 +17583,21 @@
"Default": null
},
{
- "Name": "MaxNumTerms",
- "Type": {
- "Kind": "Array",
- "ItemType": "Int"
- },
- "Desc": "Maximum number of ngrams to store in the dictionary",
- "Aliases": [
- "max"
- ],
+ "Name": "Seed",
+ "Type": "UInt",
+ "Desc": "Hashing seed",
"Required": false,
"SortOrder": 150.0,
- "IsNullable": false,
+ "IsNullable": true,
"Default": null
},
{
- "Name": "Weighting",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "Tf",
- "Idf",
- "TfIdf"
- ]
- },
- "Desc": "Statistical measure used to evaluate how important a word is to a document in a corpus",
+ "Name": "Ordered",
+ "Type": "Bool",
+ "Desc": "Whether the position of each term should be included in the hash",
+ "Aliases": [
+ "ord"
+ ],
"Required": false,
"SortOrder": 150.0,
"IsNullable": true,
@@ -17625,10 +17634,9 @@
"Aliases": [
"col"
],
- "Required": false,
+ "Required": true,
"SortOrder": 1.0,
- "IsNullable": false,
- "Default": null
+ "IsNullable": false
},
{
"Name": "Data",
@@ -17639,73 +17647,46 @@
"IsNullable": false
},
{
- "Name": "NgramLength",
+ "Name": "HashBits",
"Type": "Int",
- "Desc": "Maximum ngram length",
+ "Desc": "Number of bits to hash into. Must be between 1 and 31, inclusive.",
"Aliases": [
- "ngram"
+ "bits"
],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 2.0,
"IsNullable": false,
- "Default": 2
+ "Default": 31
},
{
- "Name": "AllLengths",
+ "Name": "Join",
"Type": "Bool",
- "Desc": "Whether to store all ngram lengths up to ngramLength, or only ngramLength",
- "Aliases": [
- "all"
- ],
+ "Desc": "Whether the values need to be combined for a single hash",
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
"Default": true
},
{
- "Name": "SkipLength",
- "Type": "Int",
- "Desc": "Maximum number of tokens to skip when constructing an ngram",
- "Aliases": [
- "skips"
- ],
+ "Name": "Seed",
+ "Type": "UInt",
+ "Desc": "Hashing seed",
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": 0
+ "Default": 314489979
},
{
- "Name": "MaxNumTerms",
- "Type": {
- "Kind": "Array",
- "ItemType": "Int"
- },
- "Desc": "Maximum number of ngrams to store in the dictionary",
+ "Name": "Ordered",
+ "Type": "Bool",
+ "Desc": "Whether the position of each term should be included in the hash",
"Aliases": [
- "max"
+ "ord"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": [
- 10000000
- ]
- },
- {
- "Name": "Weighting",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "Tf",
- "Idf",
- "TfIdf"
- ]
- },
- "Desc": "The weighting criteria",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": "Tf"
+ "Default": true
}
],
"Outputs": [
@@ -17728,11 +17709,53 @@
]
},
{
- "Name": "Transforms.NoOperation",
- "Desc": "Does nothing.",
- "FriendlyName": "No Op",
- "ShortName": "Nop",
+ "Name": "Transforms.KeyToTextConverter",
+ "Desc": "KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the KeyValues metadata.",
+ "FriendlyName": "Key To Value Transform",
+ "ShortName": null,
"Inputs": [
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition(s) (optional form: name:src)",
+ "Aliases": [
+ "col"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
{
"Name": "Data",
"Type": "DataView",
@@ -17762,32 +17785,35 @@
]
},
{
- "Name": "Transforms.OptionalColumnCreator",
- "Desc": "If the source column does not exist after deserialization, create a column with the right type and default values.",
- "FriendlyName": "Optional Column Transform",
- "ShortName": "optional",
+ "Name": "Transforms.LabelColumnKeyBooleanConverter",
+ "Desc": "Transforms the label to either key or bool (if needed) to make it suitable for classification.",
+ "FriendlyName": "Prepare Classification Label",
+ "ShortName": null,
"Inputs": [
{
- "Name": "Column",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "New column definition(s)",
- "Aliases": [
- "col"
- ],
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
},
{
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
+ "Name": "LabelColumn",
+ "Type": "String",
+ "Desc": "The label column",
"Required": true,
- "SortOrder": 1.0,
+ "SortOrder": 2.0,
"IsNullable": false
+ },
+ {
+ "Name": "TextKeyValues",
+ "Type": "Bool",
+ "Desc": "Convert the key values to text",
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": true
}
],
"Outputs": [
@@ -17810,10 +17836,10 @@
]
},
{
- "Name": "Transforms.PcaCalculator",
- "Desc": "Train an PCA Anomaly model.",
- "FriendlyName": "Principal Component Analysis Transform",
- "ShortName": "Pca",
+ "Name": "Transforms.LabelIndicator",
+ "Desc": "Label remapper used by OVA",
+ "FriendlyName": "LabelIndicator",
+ "ShortName": "LabelIndictator",
"Inputs": [
{
"Name": "Column",
@@ -17823,87 +17849,39 @@
"Kind": "Struct",
"Fields": [
{
- "Name": "WeightColumn",
- "Type": "String",
- "Desc": "The name of the weight column",
+ "Name": "ClassIndex",
+ "Type": "Int",
+ "Desc": "The positive example class for binary classification.",
"Aliases": [
- "weight"
+ "index"
],
"Required": false,
"SortOrder": 150.0,
- "IsNullable": false,
+ "IsNullable": true,
"Default": null
},
{
- "Name": "Rank",
- "Type": "Int",
- "Desc": "The number of components in the PCA",
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
"Aliases": [
- "k"
+ "name"
],
"Required": false,
"SortOrder": 150.0,
- "IsNullable": true,
+ "IsNullable": false,
"Default": null
},
{
- "Name": "Oversampling",
- "Type": "Int",
- "Desc": "Oversampling parameter for randomized PCA training",
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
"Aliases": [
- "over"
+ "src"
],
"Required": false,
"SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Center",
- "Type": "Bool",
- "Desc": "If enabled, data is centered to be zero mean",
- "Aliases": [
- "center"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Seed",
- "Type": "Int",
- "Desc": "The seed for random number generation",
- "Aliases": [
- "seed"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
- "Aliases": [
- "name"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Source",
- "Type": "String",
- "Desc": "Name of the source column",
- "Aliases": [
- "src"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
+ "IsNullable": false,
"Default": null
}
]
@@ -17913,9 +17891,10 @@
"Aliases": [
"col"
],
- "Required": true,
+ "Required": false,
"SortOrder": 1.0,
- "IsNullable": false
+ "IsNullable": false,
+ "Default": null
},
{
"Name": "Data",
@@ -17926,57 +17905,15 @@
"IsNullable": false
},
{
- "Name": "WeightColumn",
- "Type": "String",
- "Desc": "The name of the weight column",
- "Aliases": [
- "weight"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Rank",
- "Type": "Int",
- "Desc": "The number of components in the PCA",
- "Aliases": [
- "k"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 20
- },
- {
- "Name": "Oversampling",
+ "Name": "ClassIndex",
"Type": "Int",
- "Desc": "Oversampling parameter for randomized PCA training",
+ "Desc": "Label of the positive class.",
"Aliases": [
- "over"
+ "index"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": 20
- },
- {
- "Name": "Center",
- "Type": "Bool",
- "Desc": "If enabled, data is centered to be zero mean",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": true
- },
- {
- "Name": "Seed",
- "Type": "Int",
- "Desc": "The seed for random number generation",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
"Default": 0
}
],
@@ -18000,9 +17937,9 @@
]
},
{
- "Name": "Transforms.PredictedLabelColumnOriginalValueConverter",
- "Desc": "Transforms a predicted label column to its original values, unless it is of type bool.",
- "FriendlyName": "Convert Predicted Label",
+ "Name": "Transforms.LabelToFloatConverter",
+ "Desc": "Transforms the label to float to make it suitable for regression.",
+ "FriendlyName": "Prepare Regression Label",
"ShortName": null,
"Inputs": [
{
@@ -18014,9 +17951,9 @@
"IsNullable": false
},
{
- "Name": "PredictedLabelColumn",
+ "Name": "LabelColumn",
"Type": "String",
- "Desc": "The predicted label column",
+ "Desc": "The label column",
"Required": true,
"SortOrder": 2.0,
"IsNullable": false
@@ -18042,11 +17979,19 @@
]
},
{
- "Name": "Transforms.RandomNumberGenerator",
- "Desc": "Adds a column with a generated number sequence.",
- "FriendlyName": "Generate Number Transform",
- "ShortName": "Generate",
+ "Name": "Transforms.LightLda",
+ "Desc": "The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation.",
+ "FriendlyName": "Latent Dirichlet Allocation Transform",
+ "ShortName": "LightLda",
"Inputs": [
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
{
"Name": "Column",
"Type": {
@@ -18055,23 +18000,59 @@
"Kind": "Struct",
"Fields": [
{
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
+ "Name": "NumTopic",
+ "Type": "Int",
+ "Desc": "The number of topics in the LDA",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "AlphaSum",
+ "Type": "Float",
+ "Desc": "Dirichlet prior on document-topic vectors",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Beta",
+ "Type": "Float",
+ "Desc": "Dirichlet prior on vocab-topic vectors",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Mhstep",
+ "Type": "Int",
+ "Desc": "Number of Metropolis Hasting step",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "NumIterations",
+ "Type": "Int",
+ "Desc": "Number of iterations",
"Aliases": [
- "name"
+ "iter"
],
"Required": false,
"SortOrder": 150.0,
- "IsNullable": false,
+ "IsNullable": true,
"Default": null
},
{
- "Name": "UseCounter",
- "Type": "Bool",
- "Desc": "Use an auto-incremented integer starting at zero instead of a random number",
+ "Name": "LikelihoodInterval",
+ "Type": "Int",
+ "Desc": "Compute log likelihood over local dataset on this iteration interval",
"Aliases": [
- "cnt"
+ "llInterval"
],
"Required": false,
"SortOrder": 150.0,
@@ -18079,249 +18060,285 @@
"Default": null
},
{
- "Name": "Seed",
- "Type": "UInt",
- "Desc": "The random seed",
+ "Name": "NumThreads",
+ "Type": "Int",
+ "Desc": "The number of training threads",
+ "Aliases": [
+ "t"
+ ],
"Required": false,
"SortOrder": 150.0,
"IsNullable": true,
"Default": null
- }
- ]
- }
- },
- "Desc": "New column definition(s) (optional form: name:seed)",
- "Aliases": [
- "col"
- ],
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
- {
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
- {
- "Name": "UseCounter",
- "Type": "Bool",
- "Desc": "Use an auto-incremented integer starting at zero instead of a random number",
- "Aliases": [
- "cnt"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": false
- },
- {
- "Name": "Seed",
- "Type": "UInt",
- "Desc": "The random seed",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 42
- }
- ],
- "Outputs": [
- {
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
- },
- {
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
- ]
- },
- {
- "Name": "Transforms.RowRangeFilter",
- "Desc": "Filters a dataview on a column of type Single, Double or Key (contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. If the input is a Key type, the min/max are considered percentages of the number of values.",
- "FriendlyName": "Range Filter",
- "ShortName": "RangeFilter",
- "Inputs": [
- {
- "Name": "Column",
- "Type": "String",
- "Desc": "Column",
+ },
+ {
+ "Name": "NumMaxDocToken",
+ "Type": "Int",
+ "Desc": "The threshold of maximum count of tokens per doc",
+ "Aliases": [
+ "maxNumToken"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "NumSummaryTermPerTopic",
+ "Type": "Int",
+ "Desc": "The number of words to summarize the topic",
+ "Aliases": [
+ "ns"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "NumBurninIterations",
+ "Type": "Int",
+ "Desc": "The number of burn-in iterations",
+ "Aliases": [
+ "burninIter"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": 10
+ },
+ {
+ "Name": "ResetRandomGenerator",
+ "Type": "Bool",
+ "Desc": "Reset the random number generator for each document",
+ "Aliases": [
+ "reset"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition(s) (optional form: name:srcs)",
"Aliases": [
"col"
],
"Required": true,
- "SortOrder": 1.0,
+ "SortOrder": 49.0,
"IsNullable": false
},
{
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Name": "NumTopic",
+ "Type": "Int",
+ "Desc": "The number of topics in the LDA",
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": 100,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 20,
+ 40,
+ 100,
+ 200
+ ]
+ }
},
{
- "Name": "Min",
- "Type": "Float",
- "Desc": "Minimum value (0 to 1 for key types)",
+ "Name": "NumMaxDocToken",
+ "Type": "Int",
+ "Desc": "The threshold of maximum count of tokens per doc",
+ "Aliases": [
+ "maxNumToken"
+ ],
"Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": 512
},
{
- "Name": "Max",
- "Type": "Float",
- "Desc": "Maximum value (0 to 1 for key types)",
+ "Name": "NumThreads",
+ "Type": "Int",
+ "Desc": "The number of training threads. Default value depends on number of logical processors.",
+ "Aliases": [
+ "t"
+ ],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 50.0,
"IsNullable": true,
"Default": null
},
{
- "Name": "Complement",
- "Type": "Bool",
- "Desc": "If true, keep the values that fall outside the range.",
+ "Name": "AlphaSum",
+ "Type": "Float",
+ "Desc": "Dirichlet prior on document-topic vectors",
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": false
+ "Default": 100.0,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 1,
+ 10,
+ 100,
+ 200
+ ]
+ }
},
{
- "Name": "IncludeMin",
- "Type": "Bool",
- "Desc": "If true, include in the range the values that are equal to min.",
+ "Name": "Beta",
+ "Type": "Float",
+ "Desc": "Dirichlet prior on vocab-topic vectors",
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": true
+ "Default": 0.01,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.01,
+ 0.015,
+ 0.07,
+ 0.02
+ ]
+ }
},
{
- "Name": "IncludeMax",
- "Type": "Bool",
- "Desc": "If true, include in the range the values that are equal to max.",
+ "Name": "Mhstep",
+ "Type": "Int",
+ "Desc": "Number of Metropolis Hasting step",
"Required": false,
"SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- }
- ],
- "Outputs": [
- {
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
+ "IsNullable": false,
+ "Default": 4,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 2,
+ 4,
+ 8,
+ 16
+ ]
+ }
},
{
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
- ]
- },
- {
- "Name": "Transforms.RowSkipAndTakeFilter",
- "Desc": "Allows limiting input to a subset of rows at an optional offset. Can be used to implement data paging.",
- "FriendlyName": "Skip and Take Filter",
- "ShortName": "SkipTake",
- "Inputs": [
- {
- "Name": "Skip",
+ "Name": "NumIterations",
"Type": "Int",
- "Desc": "Number of items to skip",
+ "Desc": "Number of iterations",
"Aliases": [
- "s"
+ "iter"
],
"Required": false,
- "SortOrder": 1.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 200,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 100,
+ 200,
+ 300,
+ 400
+ ]
+ }
},
{
- "Name": "Take",
+ "Name": "LikelihoodInterval",
"Type": "Int",
- "Desc": "Number of items to take",
+ "Desc": "Compute log likelihood over local dataset on this iteration interval",
"Aliases": [
- "t"
+ "llInterval"
],
"Required": false,
- "SortOrder": 2.0,
- "IsNullable": true,
- "Default": null
- }
- ],
- "Outputs": [
- {
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 5
},
{
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
- ]
- },
- {
- "Name": "Transforms.RowSkipFilter",
- "Desc": "Allows limiting input to a subset of rows by skipping a number of rows.",
- "FriendlyName": "Skip Filter",
- "ShortName": "Skip",
- "Inputs": [
+ "Name": "NumSummaryTermPerTopic",
+ "Type": "Int",
+ "Desc": "The number of words to summarize the topic",
+ "Aliases": [
+ "ns"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 10
+ },
{
- "Name": "Count",
+ "Name": "NumBurninIterations",
"Type": "Int",
- "Desc": "Number of items to skip",
+ "Desc": "The number of burn-in iterations",
"Aliases": [
- "c",
- "n",
- "s"
+ "burninIter"
],
- "Required": true,
- "SortOrder": 1.0,
+ "Required": false,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 0
+ "Default": 10,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 10,
+ 20,
+ 30,
+ 40
+ ]
+ }
},
{
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Name": "ResetRandomGenerator",
+ "Type": "Bool",
+ "Desc": "Reset the random number generator for each document",
+ "Aliases": [
+ "reset"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "OutputTopicWordSummary",
+ "Type": "Bool",
+ "Desc": "Whether to output the topic-word summary in text format",
+ "Aliases": [
+ "summary"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false
}
],
"Outputs": [
@@ -18344,24 +18361,65 @@
]
},
{
- "Name": "Transforms.RowTakeFilter",
- "Desc": "Allows limiting input to a subset of rows by taking N first rows.",
- "FriendlyName": "Take Filter",
- "ShortName": "Take",
+ "Name": "Transforms.LogMeanVarianceNormalizer",
+ "Desc": "Normalizes the data based on the computed mean and variance of the logarithm of the data.",
+ "FriendlyName": "LogMeanVar Normalizer",
+ "ShortName": "LogMeanVar",
"Inputs": [
{
- "Name": "Count",
- "Type": "Int",
- "Desc": "Number of items to take",
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "MaxTrainingExamples",
+ "Type": "Int",
+ "Desc": "Max number of examples used to train the normalizer",
+ "Aliases": [
+ "maxtrain"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition(s) (optional form: name:src)",
"Aliases": [
- "c",
- "n",
- "t"
+ "col"
],
- "Required": true,
+ "Required": false,
"SortOrder": 1.0,
"IsNullable": false,
- "Default": 9223372036854775807
+ "Default": null
},
{
"Name": "Data",
@@ -18370,52 +18428,30 @@
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
- }
- ],
- "Outputs": [
- {
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
},
{
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
- }
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
- ]
- },
- {
- "Name": "Transforms.ScoreColumnSelector",
- "Desc": "Selects only the last score columns and the extra columns specified in the arguments.",
- "FriendlyName": "Choose Columns By Index",
- "ShortName": null,
- "Inputs": [
- {
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
+ "Name": "UseCdf",
+ "Type": "Bool",
+ "Desc": "Whether to use CDF as the output",
+ "Aliases": [
+ "cdf"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true
},
{
- "Name": "ExtraColumns",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "Extra columns to write",
+ "Name": "MaxTrainingExamples",
+ "Type": "Int",
+ "Desc": "Max number of examples used to train the normalizer",
+ "Aliases": [
+ "maxtrain"
+ ],
"Required": false,
- "SortOrder": 2.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": null
+ "Default": 1000000000
}
],
"Outputs": [
@@ -18438,76 +18474,118 @@
]
},
{
- "Name": "Transforms.Scorer",
- "Desc": "Turn the predictor model into a transform model",
- "FriendlyName": null,
- "ShortName": null,
- "Inputs": [
- {
- "Name": "PredictorModel",
- "Type": "PredictorModel",
- "Desc": "The predictor model to turn into a transform",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- }
- ],
- "Outputs": [
- {
- "Name": "ScoredData",
- "Type": "DataView",
- "Desc": "The scored dataset"
- },
- {
- "Name": "ScoringTransform",
- "Type": "TransformModel",
- "Desc": "The scoring transform"
- }
- ]
- },
- {
- "Name": "Transforms.Segregator",
- "Desc": "Un-groups vector columns into sequences of rows, inverse of Group transform",
- "FriendlyName": "Un-group Transform",
- "ShortName": "Ungroup",
+ "Name": "Transforms.LpNormalizer",
+ "Desc": "Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D is either L2 norm, L1 norm or LInf norm.",
+ "FriendlyName": "Lp-Norm Normalizer",
+ "ShortName": "lpnorm",
"Inputs": [
- {
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
{
"Name": "Column",
"Type": {
"Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "Columns to unroll, or 'pivot'",
- "Aliases": [
- "col"
- ],
- "Required": true,
- "SortOrder": 150.0,
- "IsNullable": false
- },
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "NormKind",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "L2Norm",
+ "StdDev",
+ "L1Norm",
+ "LInf"
+ ]
+ },
+ "Desc": "The norm to use to normalize each sample",
+ "Aliases": [
+ "norm"
+ ],
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "SubMean",
+ "Type": "Bool",
+ "Desc": "Subtract mean from each value before normalizing",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition(s) (optional form: name:src)",
+ "Aliases": [
+ "col"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
{
- "Name": "Mode",
+ "Name": "NormKind",
"Type": {
"Kind": "Enum",
"Values": [
- "Inner",
- "Outer",
- "First"
+ "L2Norm",
+ "StdDev",
+ "L1Norm",
+ "LInf"
]
},
- "Desc": "Specifies how to unroll multiple pivot columns of different size.",
+ "Desc": "The norm to use to normalize each sample",
+ "Aliases": [
+ "norm"
+ ],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 1.0,
"IsNullable": false,
- "Default": "Inner"
+ "Default": "L2Norm"
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "SubMean",
+ "Type": "Bool",
+ "Desc": "Subtract mean from each value before normalizing",
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": false,
+ "Default": false
}
],
"Outputs": [
@@ -18530,67 +18608,44 @@
]
},
{
- "Name": "Transforms.SentimentAnalyzer",
- "Desc": "Uses a pretrained sentiment model to score input strings",
- "FriendlyName": "Sentiment Analyzing Transform",
- "ShortName": "Senti",
+ "Name": "Transforms.ManyHeterogeneousModelCombiner",
+ "Desc": "Combines a sequence of TransformModels and a PredictorModel into a single PredictorModel.",
+ "FriendlyName": null,
+ "ShortName": null,
"Inputs": [
{
- "Name": "Source",
- "Type": "String",
- "Desc": "Name of the source column.",
- "Aliases": [
- "col"
- ],
+ "Name": "TransformModels",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "TransformModel"
+ },
+ "Desc": "Transform model",
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
},
{
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
+ "Name": "PredictorModel",
+ "Type": "PredictorModel",
+ "Desc": "Predictor model",
"Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column.",
- "Aliases": [
- "dst"
- ],
- "Required": false,
"SortOrder": 2.0,
- "IsNullable": false,
- "Default": null
+ "IsNullable": false
}
],
"Outputs": [
{
- "Name": "OutputData",
- "Type": "DataView",
- "Desc": "Transformed dataset"
- },
- {
- "Name": "Model",
- "Type": "TransformModel",
- "Desc": "Transform model"
+ "Name": "PredictorModel",
+ "Type": "PredictorModel",
+ "Desc": "Predictor model"
}
- ],
- "InputKind": [
- "ITransformInput"
- ],
- "OutputKind": [
- "ITransformOutput"
]
},
{
- "Name": "Transforms.SupervisedBinNormalizer",
- "Desc": "Similar to BinNormalizer, but calculates bins based on correlation with the label column, not equi-density. The new value is bin_number / number_of_bins.",
- "FriendlyName": "Supervised Binning Normalizer",
- "ShortName": "SupBin",
+ "Name": "Transforms.MeanVarianceNormalizer",
+ "Desc": "Normalizes the data based on the computed mean and variance of the data.",
+ "FriendlyName": "MeanVar Normalizer",
+ "ShortName": "MeanVar",
"Inputs": [
{
"Name": "Column",
@@ -18599,18 +18654,6 @@
"ItemType": {
"Kind": "Struct",
"Fields": [
- {
- "Name": "NumBins",
- "Type": "Int",
- "Desc": "Max number of bins, power of 2 recommended",
- "Aliases": [
- "bins"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
{
"Name": "FixZero",
"Type": "Bool",
@@ -18666,10 +18709,9 @@
"Aliases": [
"col"
],
- "Required": false,
+ "Required": true,
"SortOrder": 1.0,
- "IsNullable": false,
- "Default": null
+ "IsNullable": false
},
{
"Name": "Data",
@@ -18680,37 +18722,16 @@
"IsNullable": false
},
{
- "Name": "LabelColumn",
- "Type": "String",
- "Desc": "Label column for supervised binning",
- "Aliases": [
- "label",
- "lab"
- ],
- "Required": true,
- "SortOrder": 150.0,
- "IsNullable": false
- },
- {
- "Name": "MinBinSize",
- "Type": "Int",
- "Desc": "Minimum number of examples per bin",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": 10
- },
- {
- "Name": "NumBins",
- "Type": "Int",
- "Desc": "Max number of bins, power of 2 recommended",
+ "Name": "UseCdf",
+ "Type": "Bool",
+ "Desc": "Whether to use CDF as the output",
"Aliases": [
- "bins"
+ "cdf"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": 1024
+ "Default": false
},
{
"Name": "FixZero",
@@ -18757,47 +18778,71 @@
]
},
{
- "Name": "Transforms.TextFeaturizer",
- "Desc": "A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text.",
- "FriendlyName": "Text Transform",
- "ShortName": "Text",
+ "Name": "Transforms.MinMaxNormalizer",
+ "Desc": "Normalizes the data based on the observed minimum and maximum values of the data.",
+ "FriendlyName": "Min-Max Normalizer",
+ "ShortName": "MinMax",
"Inputs": [
{
"Name": "Column",
"Type": {
- "Kind": "Struct",
- "Fields": [
- {
- "Name": "Name",
- "Type": "String",
- "Desc": "Name of the new column",
- "Aliases": [
- "name"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Source",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "Name of the source column",
- "Aliases": [
- "src"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- }
- ]
- },
- "Desc": "New column definition (optional form: name:srcs).",
- "Aliases": [
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "FixZero",
+ "Type": "Bool",
+ "Desc": "Whether to map zero to zero, preserving sparsity",
+ "Aliases": [
+ "zero"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "MaxTrainingExamples",
+ "Type": "Int",
+ "Desc": "Max number of examples used to train the normalizer",
+ "Aliases": [
+ "maxtrain"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition(s) (optional form: name:src)",
+ "Aliases": [
"col"
],
"Required": true,
@@ -18813,232 +18858,189 @@
"IsNullable": false
},
{
- "Name": "Language",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "English",
- "French",
- "German",
- "Dutch",
- "Italian",
- "Spanish",
- "Japanese"
- ]
- },
- "Desc": "Dataset language or 'AutoDetect' to detect language per row.",
+ "Name": "FixZero",
+ "Type": "Bool",
+ "Desc": "Whether to map zero to zero, preserving sparsity",
"Aliases": [
- "lang"
+ "zero"
],
"Required": false,
- "SortOrder": 3.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": "English"
+ "Default": true
},
{
- "Name": "StopWordsRemover",
- "Type": {
- "Kind": "Component",
- "ComponentKind": "StopWordsRemover"
- },
- "Desc": "Stopwords remover.",
+ "Name": "MaxTrainingExamples",
+ "Type": "Int",
+ "Desc": "Max number of examples used to train the normalizer",
"Aliases": [
- "remover"
+ "maxtrain"
],
"Required": false,
- "SortOrder": 4.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": null
+ "Default": 1000000000
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
},
{
- "Name": "TextCase",
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.MissingValueHandler",
+      "Desc": "Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). An indicator column can optionally be concatenated, if the input column type is numeric.",
+ "FriendlyName": "NA Handle Transform",
+ "ShortName": "NAHandle",
+ "Inputs": [
+ {
+ "Name": "Column",
"Type": {
- "Kind": "Enum",
- "Values": [
- "Lower",
- "Upper",
- "None"
- ]
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "Kind",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "DefaultValue",
+ "Mean",
+ "Minimum",
+ "Maximum"
+ ]
+ },
+ "Desc": "The replacement method to utilize",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "ImputeBySlot",
+ "Type": "Bool",
+ "Desc": "Whether to impute values by slot",
+ "Aliases": [
+ "slot"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "ConcatIndicator",
+ "Type": "Bool",
+ "Desc": "Whether or not to concatenate an indicator vector column to the value column",
+ "Aliases": [
+ "ind"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
},
- "Desc": "Casing text using the rules of the invariant culture.",
+ "Desc": "New column definition(s) (optional form: name:rep:src)",
"Aliases": [
- "case"
+ "col"
],
- "Required": false,
- "SortOrder": 5.0,
- "IsNullable": false,
- "Default": "Lower"
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
},
{
- "Name": "KeepDiacritics",
- "Type": "Bool",
- "Desc": "Whether to keep diacritical marks or remove them.",
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "ReplaceWith",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "DefaultValue",
+ "Mean",
+ "Minimum",
+ "Maximum"
+ ]
+ },
+ "Desc": "The replacement method to utilize",
"Aliases": [
- "diac"
+ "kind"
],
"Required": false,
- "SortOrder": 6.0,
+ "SortOrder": 2.0,
"IsNullable": false,
- "Default": false
+ "Default": "Def"
},
{
- "Name": "KeepPunctuations",
+ "Name": "ImputeBySlot",
"Type": "Bool",
- "Desc": "Whether to keep punctuation marks or remove them.",
+ "Desc": "Whether to impute values by slot",
"Aliases": [
- "punc"
+ "slot"
],
"Required": false,
- "SortOrder": 7.0,
+ "SortOrder": 150.0,
"IsNullable": false,
"Default": true
},
{
- "Name": "KeepNumbers",
+ "Name": "Concat",
"Type": "Bool",
- "Desc": "Whether to keep numbers or remove them.",
+ "Desc": "Whether or not to concatenate an indicator vector column to the value column",
"Aliases": [
- "num"
+ "ind"
],
"Required": false,
- "SortOrder": 8.0,
+ "SortOrder": 150.0,
"IsNullable": false,
"Default": true
- },
- {
- "Name": "OutputTokens",
- "Type": "Bool",
- "Desc": "Whether to output the transformed text tokens as an additional column.",
- "Aliases": [
- "tokens",
- "showtext",
- "showTransformedText"
- ],
- "Required": false,
- "SortOrder": 9.0,
- "IsNullable": false,
- "Default": false
- },
- {
- "Name": "Dictionary",
- "Type": {
- "Kind": "Struct",
- "Fields": [
- {
- "Name": "Term",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "List of terms",
- "Required": false,
- "SortOrder": 1.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Sort",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "Occurrence",
- "Value"
- ]
- },
- "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').",
- "Required": false,
- "SortOrder": 5.0,
- "IsNullable": false,
- "Default": "Occurrence"
- },
- {
- "Name": "DropUnknowns",
- "Type": "Bool",
- "Desc": "Drop unknown terms instead of mapping them to NA term.",
- "Aliases": [
- "dropna"
- ],
- "Required": false,
- "SortOrder": 6.0,
- "IsNullable": false,
- "Default": false
- }
- ]
- },
- "Desc": "A dictionary of whitelisted terms.",
- "Aliases": [
- "dict"
- ],
- "Required": false,
- "SortOrder": 10.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "WordFeatureExtractor",
- "Type": {
- "Kind": "Component",
- "ComponentKind": "NgramExtractor"
- },
- "Desc": "Ngram feature extractor to use for words (WordBag/WordHashBag).",
- "Aliases": [
- "wordExtractor"
- ],
- "Required": false,
- "SortOrder": 11.0,
- "IsNullable": false,
- "Default": {
- "Name": "NGram",
- "Settings": {
- "MaxNumTerms": [
- 10000000
- ]
- }
- }
- },
- {
- "Name": "CharFeatureExtractor",
- "Type": {
- "Kind": "Component",
- "ComponentKind": "NgramExtractor"
- },
- "Desc": "Ngram feature extractor to use for characters (WordBag/WordHashBag).",
- "Aliases": [
- "charExtractor"
- ],
- "Required": false,
- "SortOrder": 12.0,
- "IsNullable": false,
- "Default": {
- "Name": "NGram",
- "Settings": {
- "NgramLength": 3,
- "AllLengths": false,
- "MaxNumTerms": [
- 10000000
- ]
- }
- }
- },
- {
- "Name": "VectorNormalizer",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "None",
- "L1",
- "L2",
- "LInf"
- ]
- },
- "Desc": "Normalize vectors (rows) individually by rescaling them to unit norm.",
- "Aliases": [
- "norm"
- ],
- "Required": false,
- "SortOrder": 13.0,
- "IsNullable": false,
- "Default": "L2"
}
],
"Outputs": [
@@ -19061,10 +19063,10 @@
]
},
{
- "Name": "Transforms.TextToKeyConverter",
- "Desc": "Converts input values (words, numbers, etc.) to index in a dictionary.",
- "FriendlyName": "Term Transform",
- "ShortName": null,
+ "Name": "Transforms.MissingValueIndicator",
+ "Desc": "Create a boolean output column with the same number of slots as the input column, where the output value is true if the value in the input column is missing.",
+ "FriendlyName": "NA Indicator Transform",
+ "ShortName": "NAInd",
"Inputs": [
{
"Name": "Column",
@@ -19073,57 +19075,6 @@
"ItemType": {
"Kind": "Struct",
"Fields": [
- {
- "Name": "MaxNumTerms",
- "Type": "Int",
- "Desc": "Maximum number of terms to keep when auto-training",
- "Aliases": [
- "max"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Term",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "List of terms",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Sort",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "Occurrence",
- "Value"
- ]
- },
- "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "TextKeyValues",
- "Type": "Bool",
- "Desc": "Whether key value metadata should be text, regardless of the actual input type",
- "Aliases": [
- "textkv"
- ],
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
{
"Name": "Name",
"Type": "String",
@@ -19155,10 +19106,9 @@
"Aliases": [
"col"
],
- "Required": false,
+ "Required": true,
"SortOrder": 1.0,
- "IsNullable": false,
- "Default": null
+ "IsNullable": false
},
{
"Name": "Data",
@@ -19167,57 +19117,6 @@
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
- },
- {
- "Name": "MaxNumTerms",
- "Type": "Int",
- "Desc": "Maximum number of terms to keep per column when auto-training",
- "Aliases": [
- "max"
- ],
- "Required": false,
- "SortOrder": 5.0,
- "IsNullable": false,
- "Default": 1000000
- },
- {
- "Name": "Term",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "List of terms",
- "Required": false,
- "SortOrder": 106.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "Sort",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "Occurrence",
- "Value"
- ]
- },
- "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').",
- "Required": false,
- "SortOrder": 113.0,
- "IsNullable": false,
- "Default": "Occurrence"
- },
- {
- "Name": "TextKeyValues",
- "Type": "Bool",
- "Desc": "Whether key value metadata should be text, regardless of the actual input type",
- "Aliases": [
- "textkv"
- ],
- "Required": false,
- "SortOrder": 114.0,
- "IsNullable": false,
- "Default": false
}
],
"Outputs": [
@@ -19240,99 +19139,117 @@
]
},
{
- "Name": "Transforms.TrainTestDatasetSplitter",
- "Desc": "Split the dataset into train and test sets",
- "FriendlyName": "Dataset Train-Test Split",
- "ShortName": null,
- "Inputs": [
- {
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
+ "Name": "Transforms.MissingValuesDropper",
+ "Desc": "Removes NAs from vector columns.",
+ "FriendlyName": "NA Drop Transform",
+ "ShortName": "NADrop",
+ "Inputs": [
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "Columns to drop the NAs for",
+ "Aliases": [
+ "col"
+ ],
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
},
{
- "Name": "Fraction",
- "Type": "Float",
- "Desc": "Fraction of training data",
- "Required": false,
- "SortOrder": 2.0,
- "IsNullable": false,
- "Default": 0.8
- },
- {
- "Name": "StratificationColumn",
- "Type": "String",
- "Desc": "Stratification column",
- "Aliases": [
- "strat"
- ],
- "Required": false,
- "SortOrder": 3.0,
- "IsNullable": false,
- "Default": null
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
}
],
"Outputs": [
{
- "Name": "TrainData",
+ "Name": "OutputData",
"Type": "DataView",
- "Desc": "Training data"
+ "Desc": "Transformed dataset"
},
{
- "Name": "TestData",
- "Type": "DataView",
- "Desc": "Testing data"
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
}
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
]
},
{
- "Name": "Transforms.TreeLeafFeaturizer",
- "Desc": "Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices.",
- "FriendlyName": "Tree Ensemble Featurization Transform",
- "ShortName": "TreeFeat",
+ "Name": "Transforms.MissingValuesRowDropper",
+ "Desc": "Filters out rows that contain missing values.",
+ "FriendlyName": "NA Filter",
+ "ShortName": "NAFilter",
"Inputs": [
{
- "Name": "Data",
- "Type": "DataView",
- "Desc": "Input dataset",
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "Column",
+ "Aliases": [
+ "col"
+ ],
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
},
{
- "Name": "PredictorModel",
- "Type": "PredictorModel",
- "Desc": "Trainer to use",
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
"Required": true,
- "SortOrder": 10.0,
+ "SortOrder": 1.0,
"IsNullable": false
},
{
- "Name": "Suffix",
- "Type": "String",
- "Desc": "Output column: The suffix to append to the default column names",
- "Aliases": [
- "ex"
- ],
- "Required": false,
- "SortOrder": 101.0,
- "IsNullable": false,
- "Default": null
- },
- {
- "Name": "LabelPermutationSeed",
- "Type": "Int",
- "Desc": "If specified, determines the permutation seed for applying this featurizer to a multiclass problem.",
- "Aliases": [
- "lps"
- ],
+ "Name": "Complement",
+ "Type": "Bool",
+ "Desc": "If true, keep only rows that contain NA values, and filter the rest.",
"Required": false,
- "SortOrder": 102.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 0
+ "Default": false
}
],
"Outputs": [
@@ -19348,7 +19265,6 @@
}
],
"InputKind": [
- "IFeaturizerInput",
"ITransformInput"
],
"OutputKind": [
@@ -19356,41 +19272,10 @@
]
},
{
- "Name": "Transforms.TwoHeterogeneousModelCombiner",
- "Desc": "Combines a TransformModel and a PredictorModel into a single PredictorModel.",
- "FriendlyName": null,
- "ShortName": null,
- "Inputs": [
- {
- "Name": "TransformModel",
- "Type": "TransformModel",
- "Desc": "Transform model",
- "Required": true,
- "SortOrder": 1.0,
- "IsNullable": false
- },
- {
- "Name": "PredictorModel",
- "Type": "PredictorModel",
- "Desc": "Predictor model",
- "Required": true,
- "SortOrder": 2.0,
- "IsNullable": false
- }
- ],
- "Outputs": [
- {
- "Name": "PredictorModel",
- "Type": "PredictorModel",
- "Desc": "Predictor model"
- }
- ]
- },
- {
- "Name": "Transforms.WordTokenizer",
- "Desc": "The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.",
- "FriendlyName": "Tokenize Text Transform",
- "ShortName": "TokenizeTextTransform",
+ "Name": "Transforms.MissingValueSubstitutor",
+ "Desc": "Create an output column of the same type and size of the input column, where missing values are replaced with either the default value or the mean/min/max value (for non-text columns only).",
+ "FriendlyName": "NA Replace Transform",
+ "ShortName": "NARep",
"Inputs": [
{
"Name": "Column",
@@ -19400,17 +19285,44 @@
"Kind": "Struct",
"Fields": [
{
- "Name": "TermSeparators",
+ "Name": "ReplacementString",
"Type": "String",
- "Desc": "Comma separated set of term separator(s). Commonly: 'space', 'comma', 'semicolon' or other single character.",
+ "Desc": "Replacement value for NAs (uses default value if not given)",
"Aliases": [
- "sep"
+ "rep"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
"Default": null
},
+ {
+ "Name": "Kind",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "DefaultValue",
+ "Mean",
+ "Minimum",
+ "Maximum",
+ "SpecifiedValue"
+ ]
+ },
+ "Desc": "The replacement method to utilize",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Slot",
+ "Type": "Bool",
+ "Desc": "Whether to impute values by slot",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
{
"Name": "Name",
"Type": "String",
@@ -19438,14 +19350,13 @@
]
}
},
- "Desc": "New column definition(s)",
+ "Desc": "New column definition(s) (optional form: name:rep:src)",
"Aliases": [
"col"
],
- "Required": false,
+ "Required": true,
"SortOrder": 1.0,
- "IsNullable": false,
- "Default": null
+ "IsNullable": false
},
{
"Name": "Data",
@@ -19456,16 +19367,37 @@
"IsNullable": false
},
{
- "Name": "TermSeparators",
- "Type": "String",
- "Desc": "Comma separated set of term separator(s). Commonly: 'space', 'comma', 'semicolon' or other single character.",
+ "Name": "ReplacementKind",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "DefaultValue",
+ "Mean",
+ "Minimum",
+ "Maximum",
+ "SpecifiedValue"
+ ]
+ },
+ "Desc": "The replacement method to utilize",
"Aliases": [
- "sep"
+ "kind"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": "space"
+ "Default": "Default"
+ },
+ {
+ "Name": "ImputeBySlot",
+ "Type": "Bool",
+ "Desc": "Whether to impute values by slot",
+ "Aliases": [
+ "slot"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true
}
],
"Outputs": [
@@ -19486,201 +19418,2748 @@
"OutputKind": [
"ITransformOutput"
]
- }
- ],
- "Components": [
+ },
{
- "Kind": "AutoMlEngine",
- "Components": [
+ "Name": "Transforms.ModelCombiner",
+ "Desc": "Combines a sequence of TransformModels into a single model",
+ "FriendlyName": null,
+ "ShortName": null,
+ "Inputs": [
+ {
+ "Name": "Models",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "TransformModel"
+ },
+ "Desc": "Input models",
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputModel",
+ "Type": "TransformModel",
+ "Desc": "Combined model"
+ }
+ ]
+ },
+ {
+ "Name": "Transforms.NGramTranslator",
+ "Desc": "Produces a bag of counts of ngrams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.",
+ "FriendlyName": "NGram Transform",
+ "ShortName": "NgramTransform",
+ "Inputs": [
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "NgramLength",
+ "Type": "Int",
+ "Desc": "Maximum ngram length",
+ "Aliases": [
+ "ngram"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "AllLengths",
+ "Type": "Bool",
+ "Desc": "Whether to include all ngram lengths up to NgramLength or only NgramLength",
+ "Aliases": [
+ "all"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "SkipLength",
+ "Type": "Int",
+ "Desc": "Maximum number of tokens to skip when constructing an ngram",
+ "Aliases": [
+ "skips"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "MaxNumTerms",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "Int"
+ },
+ "Desc": "Maximum number of ngrams to store in the dictionary",
+ "Aliases": [
+ "max"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Weighting",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Tf",
+ "Idf",
+ "TfIdf"
+ ]
+ },
+ "Desc": "Statistical measure used to evaluate how important a word is to a document in a corpus",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition(s) (optional form: name:src)",
+ "Aliases": [
+ "col"
+ ],
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "NgramLength",
+ "Type": "Int",
+ "Desc": "Maximum ngram length",
+ "Aliases": [
+ "ngram"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 2
+ },
+ {
+ "Name": "AllLengths",
+ "Type": "Bool",
+ "Desc": "Whether to store all ngram lengths up to ngramLength, or only ngramLength",
+ "Aliases": [
+ "all"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true
+ },
+ {
+ "Name": "SkipLength",
+ "Type": "Int",
+ "Desc": "Maximum number of tokens to skip when constructing an ngram",
+ "Aliases": [
+ "skips"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0
+ },
+ {
+ "Name": "MaxNumTerms",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "Int"
+ },
+ "Desc": "Maximum number of ngrams to store in the dictionary",
+ "Aliases": [
+ "max"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": [
+ 10000000
+ ]
+ },
+ {
+ "Name": "Weighting",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Tf",
+ "Idf",
+ "TfIdf"
+ ]
+ },
+ "Desc": "The weighting criteria",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": "Tf"
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.NoOperation",
+ "Desc": "Does nothing.",
+ "FriendlyName": "No Op",
+ "ShortName": "Nop",
+ "Inputs": [
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.OptionalColumnCreator",
+ "Desc": "If the source column does not exist after deserialization, create a column with the right type and default values.",
+ "FriendlyName": "Optional Column Transform",
+ "ShortName": "optional",
+ "Inputs": [
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "New column definition(s)",
+ "Aliases": [
+ "col"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.PcaCalculator",
+      "Desc": "Train a PCA Anomaly model.",
+ "FriendlyName": "Principal Component Analysis Transform",
+ "ShortName": "Pca",
+ "Inputs": [
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "WeightColumn",
+ "Type": "String",
+ "Desc": "The name of the weight column",
+ "Aliases": [
+ "weight"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Rank",
+ "Type": "Int",
+ "Desc": "The number of components in the PCA",
+ "Aliases": [
+ "k"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Oversampling",
+ "Type": "Int",
+ "Desc": "Oversampling parameter for randomized PCA training",
+ "Aliases": [
+ "over"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Center",
+ "Type": "Bool",
+ "Desc": "If enabled, data is centered to be zero mean",
+ "Aliases": [
+ "center"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Seed",
+ "Type": "Int",
+ "Desc": "The seed for random number generation",
+ "Aliases": [
+ "seed"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition(s) (optional form: name:src)",
+ "Aliases": [
+ "col"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "WeightColumn",
+ "Type": "String",
+ "Desc": "The name of the weight column",
+ "Aliases": [
+ "weight"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Rank",
+ "Type": "Int",
+ "Desc": "The number of components in the PCA",
+ "Aliases": [
+ "k"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 20
+ },
+ {
+ "Name": "Oversampling",
+ "Type": "Int",
+ "Desc": "Oversampling parameter for randomized PCA training",
+ "Aliases": [
+ "over"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 20
+ },
+ {
+ "Name": "Center",
+ "Type": "Bool",
+ "Desc": "If enabled, data is centered to be zero mean",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true
+ },
+ {
+ "Name": "Seed",
+ "Type": "Int",
+ "Desc": "The seed for random number generation",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.PredictedLabelColumnOriginalValueConverter",
+ "Desc": "Transforms a predicted label column to its original values, unless it is of type bool.",
+ "FriendlyName": "Convert Predicted Label",
+ "ShortName": null,
+ "Inputs": [
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "PredictedLabelColumn",
+ "Type": "String",
+ "Desc": "The predicted label column",
+ "Required": true,
+ "SortOrder": 2.0,
+ "IsNullable": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.RandomNumberGenerator",
+ "Desc": "Adds a column with a generated number sequence.",
+ "FriendlyName": "Generate Number Transform",
+ "ShortName": "Generate",
+ "Inputs": [
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "UseCounter",
+ "Type": "Bool",
+ "Desc": "Use an auto-incremented integer starting at zero instead of a random number",
+ "Aliases": [
+ "cnt"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Seed",
+ "Type": "UInt",
+ "Desc": "The random seed",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition(s) (optional form: name:seed)",
+ "Aliases": [
+ "col"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "UseCounter",
+ "Type": "Bool",
+ "Desc": "Use an auto-incremented integer starting at zero instead of a random number",
+ "Aliases": [
+ "cnt"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "Seed",
+ "Type": "UInt",
+ "Desc": "The random seed",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 42
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.RowRangeFilter",
+ "Desc": "Filters a dataview on a column of type Single, Double or Key (contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. If the input is a Key type, the min/max are considered percentages of the number of values.",
+ "FriendlyName": "Range Filter",
+ "ShortName": "RangeFilter",
+ "Inputs": [
+ {
+ "Name": "Column",
+ "Type": "String",
+ "Desc": "Column",
+ "Aliases": [
+ "col"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Min",
+ "Type": "Float",
+ "Desc": "Minimum value (0 to 1 for key types)",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Max",
+ "Type": "Float",
+ "Desc": "Maximum value (0 to 1 for key types)",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Complement",
+ "Type": "Bool",
+ "Desc": "If true, keep the values that fall outside the range.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "IncludeMin",
+ "Type": "Bool",
+ "Desc": "If true, include in the range the values that are equal to min.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true
+ },
+ {
+ "Name": "IncludeMax",
+ "Type": "Bool",
+ "Desc": "If true, include in the range the values that are equal to max.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.RowSkipAndTakeFilter",
+ "Desc": "Allows limiting input to a subset of rows at an optional offset. Can be used to implement data paging.",
+ "FriendlyName": "Skip and Take Filter",
+ "ShortName": "SkipTake",
+ "Inputs": [
+ {
+ "Name": "Skip",
+ "Type": "Int",
+ "Desc": "Number of items to skip",
+ "Aliases": [
+ "s"
+ ],
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Take",
+ "Type": "Int",
+ "Desc": "Number of items to take",
+ "Aliases": [
+ "t"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": true,
+ "Default": null
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.RowSkipFilter",
+ "Desc": "Allows limiting input to a subset of rows by skipping a number of rows.",
+ "FriendlyName": "Skip Filter",
+ "ShortName": "Skip",
+ "Inputs": [
+ {
+ "Name": "Count",
+ "Type": "Int",
+ "Desc": "Number of items to skip",
+ "Aliases": [
+ "c",
+ "n",
+ "s"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": 0
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.RowTakeFilter",
+ "Desc": "Allows limiting input to a subset of rows by taking N first rows.",
+ "FriendlyName": "Take Filter",
+ "ShortName": "Take",
+ "Inputs": [
+ {
+ "Name": "Count",
+ "Type": "Int",
+ "Desc": "Number of items to take",
+ "Aliases": [
+ "c",
+ "n",
+ "t"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": 9223372036854775807
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.ScoreColumnSelector",
+ "Desc": "Selects only the last score columns and the extra columns specified in the arguments.",
+ "FriendlyName": "Choose Columns By Index",
+ "ShortName": null,
+ "Inputs": [
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "ExtraColumns",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "Extra columns to write",
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.Scorer",
+ "Desc": "Turn the predictor model into a transform model",
+ "FriendlyName": null,
+ "ShortName": null,
+ "Inputs": [
+ {
+ "Name": "PredictorModel",
+ "Type": "PredictorModel",
+ "Desc": "The predictor model to turn into a transform",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "ScoredData",
+ "Type": "DataView",
+ "Desc": "The scored dataset"
+ },
+ {
+ "Name": "ScoringTransform",
+ "Type": "TransformModel",
+ "Desc": "The scoring transform"
+ }
+ ]
+ },
+ {
+ "Name": "Transforms.Segregator",
+ "Desc": "Un-groups vector columns into sequences of rows, inverse of Group transform",
+ "FriendlyName": "Un-group Transform",
+ "ShortName": "Ungroup",
+ "Inputs": [
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "Columns to unroll, or 'pivot'",
+ "Aliases": [
+ "col"
+ ],
+ "Required": true,
+ "SortOrder": 150.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Mode",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Inner",
+ "Outer",
+ "First"
+ ]
+ },
+ "Desc": "Specifies how to unroll multiple pivot columns of different size.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": "Inner"
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.SentimentAnalyzer",
+ "Desc": "Uses a pretrained sentiment model to score input strings",
+ "FriendlyName": "Sentiment Analyzing Transform",
+ "ShortName": "Senti",
+ "Inputs": [
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column.",
+ "Aliases": [
+ "col"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column.",
+ "Aliases": [
+ "dst"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.SupervisedBinNormalizer",
+ "Desc": "Similar to BinNormalizer, but calculates bins based on correlation with the label column, not equi-density. The new value is bin_number / number_of_bins.",
+ "FriendlyName": "Supervised Binning Normalizer",
+ "ShortName": "SupBin",
+ "Inputs": [
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "NumBins",
+ "Type": "Int",
+ "Desc": "Max number of bins, power of 2 recommended",
+ "Aliases": [
+ "bins"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "FixZero",
+ "Type": "Bool",
+ "Desc": "Whether to map zero to zero, preserving sparsity",
+ "Aliases": [
+ "zero"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "MaxTrainingExamples",
+ "Type": "Int",
+ "Desc": "Max number of examples used to train the normalizer",
+ "Aliases": [
+ "maxtrain"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition(s) (optional form: name:src)",
+ "Aliases": [
+ "col"
+ ],
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "LabelColumn",
+ "Type": "String",
+ "Desc": "Label column for supervised binning",
+ "Aliases": [
+ "label",
+ "lab"
+ ],
+ "Required": true,
+ "SortOrder": 150.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "MinBinSize",
+ "Type": "Int",
+ "Desc": "Minimum number of examples per bin",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 10
+ },
+ {
+ "Name": "NumBins",
+ "Type": "Int",
+ "Desc": "Max number of bins, power of 2 recommended",
+ "Aliases": [
+ "bins"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1024
+ },
+ {
+ "Name": "FixZero",
+ "Type": "Bool",
+ "Desc": "Whether to map zero to zero, preserving sparsity",
+ "Aliases": [
+ "zero"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true
+ },
+ {
+ "Name": "MaxTrainingExamples",
+ "Type": "Int",
+ "Desc": "Max number of examples used to train the normalizer",
+ "Aliases": [
+ "maxtrain"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1000000000
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.TextFeaturizer",
+ "Desc": "A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text.",
+ "FriendlyName": "Text Transform",
+ "ShortName": "Text",
+ "Inputs": [
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ },
+ "Desc": "New column definition (optional form: name:srcs).",
+ "Aliases": [
+ "col"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Language",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "English",
+ "French",
+ "German",
+ "Dutch",
+ "Italian",
+ "Spanish",
+ "Japanese"
+ ]
+ },
+ "Desc": "Dataset language or 'AutoDetect' to detect language per row.",
+ "Aliases": [
+ "lang"
+ ],
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": "English"
+ },
+ {
+ "Name": "StopWordsRemover",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "StopWordsRemover"
+ },
+ "Desc": "Stopwords remover.",
+ "Aliases": [
+ "remover"
+ ],
+ "Required": false,
+ "SortOrder": 4.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "TextCase",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Lower",
+ "Upper",
+ "None"
+ ]
+ },
+ "Desc": "Casing text using the rules of the invariant culture.",
+ "Aliases": [
+ "case"
+ ],
+ "Required": false,
+ "SortOrder": 5.0,
+ "IsNullable": false,
+ "Default": "Lower"
+ },
+ {
+ "Name": "KeepDiacritics",
+ "Type": "Bool",
+ "Desc": "Whether to keep diacritical marks or remove them.",
+ "Aliases": [
+ "diac"
+ ],
+ "Required": false,
+ "SortOrder": 6.0,
+ "IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "KeepPunctuations",
+ "Type": "Bool",
+ "Desc": "Whether to keep punctuation marks or remove them.",
+ "Aliases": [
+ "punc"
+ ],
+ "Required": false,
+ "SortOrder": 7.0,
+ "IsNullable": false,
+ "Default": true
+ },
+ {
+ "Name": "KeepNumbers",
+ "Type": "Bool",
+ "Desc": "Whether to keep numbers or remove them.",
+ "Aliases": [
+ "num"
+ ],
+ "Required": false,
+ "SortOrder": 8.0,
+ "IsNullable": false,
+ "Default": true
+ },
+ {
+ "Name": "OutputTokens",
+ "Type": "Bool",
+ "Desc": "Whether to output the transformed text tokens as an additional column.",
+ "Aliases": [
+ "tokens",
+ "showtext",
+ "showTransformedText"
+ ],
+ "Required": false,
+ "SortOrder": 9.0,
+ "IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "Dictionary",
+ "Type": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "Term",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "List of terms",
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Sort",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Occurrence",
+ "Value"
+ ]
+ },
+ "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').",
+ "Required": false,
+ "SortOrder": 5.0,
+ "IsNullable": false,
+ "Default": "Occurrence"
+ },
+ {
+ "Name": "DropUnknowns",
+ "Type": "Bool",
+ "Desc": "Drop unknown terms instead of mapping them to NA term.",
+ "Aliases": [
+ "dropna"
+ ],
+ "Required": false,
+ "SortOrder": 6.0,
+ "IsNullable": false,
+ "Default": false
+ }
+ ]
+ },
+ "Desc": "A dictionary of whitelisted terms.",
+ "Aliases": [
+ "dict"
+ ],
+ "Required": false,
+ "SortOrder": 10.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "WordFeatureExtractor",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "NgramExtractor"
+ },
+ "Desc": "Ngram feature extractor to use for words (WordBag/WordHashBag).",
+ "Aliases": [
+ "wordExtractor"
+ ],
+ "Required": false,
+ "SortOrder": 11.0,
+ "IsNullable": false,
+ "Default": {
+ "Name": "NGram",
+ "Settings": {
+ "MaxNumTerms": [
+ 10000000
+ ]
+ }
+ }
+ },
+ {
+ "Name": "CharFeatureExtractor",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "NgramExtractor"
+ },
+ "Desc": "Ngram feature extractor to use for characters (WordBag/WordHashBag).",
+ "Aliases": [
+ "charExtractor"
+ ],
+ "Required": false,
+ "SortOrder": 12.0,
+ "IsNullable": false,
+ "Default": {
+ "Name": "NGram",
+ "Settings": {
+ "NgramLength": 3,
+ "AllLengths": false,
+ "MaxNumTerms": [
+ 10000000
+ ]
+ }
+ }
+ },
+ {
+ "Name": "VectorNormalizer",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "None",
+ "L1",
+ "L2",
+ "LInf"
+ ]
+ },
+ "Desc": "Normalize vectors (rows) individually by rescaling them to unit norm.",
+ "Aliases": [
+ "norm"
+ ],
+ "Required": false,
+ "SortOrder": 13.0,
+ "IsNullable": false,
+ "Default": "L2"
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.TextToKeyConverter",
+ "Desc": "Converts input values (words, numbers, etc.) to index in a dictionary.",
+ "FriendlyName": "Term Transform",
+ "ShortName": null,
+ "Inputs": [
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "MaxNumTerms",
+ "Type": "Int",
+ "Desc": "Maximum number of terms to keep when auto-training",
+ "Aliases": [
+ "max"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Term",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "List of terms",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Sort",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Occurrence",
+ "Value"
+ ]
+ },
+ "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "TextKeyValues",
+ "Type": "Bool",
+ "Desc": "Whether key value metadata should be text, regardless of the actual input type",
+ "Aliases": [
+ "textkv"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition(s) (optional form: name:src)",
+ "Aliases": [
+ "col"
+ ],
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "MaxNumTerms",
+ "Type": "Int",
+ "Desc": "Maximum number of terms to keep per column when auto-training",
+ "Aliases": [
+ "max"
+ ],
+ "Required": false,
+ "SortOrder": 5.0,
+ "IsNullable": false,
+ "Default": 1000000
+ },
+ {
+ "Name": "Term",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "List of terms",
+ "Required": false,
+ "SortOrder": 106.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Sort",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Occurrence",
+ "Value"
+ ]
+ },
+ "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').",
+ "Required": false,
+ "SortOrder": 113.0,
+ "IsNullable": false,
+ "Default": "Occurrence"
+ },
+ {
+ "Name": "TextKeyValues",
+ "Type": "Bool",
+ "Desc": "Whether key value metadata should be text, regardless of the actual input type",
+ "Aliases": [
+ "textkv"
+ ],
+ "Required": false,
+ "SortOrder": 114.0,
+ "IsNullable": false,
+ "Default": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.TrainTestDatasetSplitter",
+ "Desc": "Split the dataset into train and test sets",
+ "FriendlyName": "Dataset Train-Test Split",
+ "ShortName": null,
+ "Inputs": [
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Fraction",
+ "Type": "Float",
+ "Desc": "Fraction of training data",
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": false,
+ "Default": 0.8
+ },
+ {
+ "Name": "StratificationColumn",
+ "Type": "String",
+ "Desc": "Stratification column",
+ "Aliases": [
+ "strat"
+ ],
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "TrainData",
+ "Type": "DataView",
+ "Desc": "Training data"
+ },
+ {
+ "Name": "TestData",
+ "Type": "DataView",
+ "Desc": "Testing data"
+ }
+ ]
+ },
+ {
+ "Name": "Transforms.TreeLeafFeaturizer",
+ "Desc": "Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices.",
+ "FriendlyName": "Tree Ensemble Featurization Transform",
+ "ShortName": "TreeFeat",
+ "Inputs": [
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "PredictorModel",
+ "Type": "PredictorModel",
+ "Desc": "Trainer to use",
+ "Required": true,
+ "SortOrder": 10.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Suffix",
+ "Type": "String",
+ "Desc": "Output column: The suffix to append to the default column names",
+ "Aliases": [
+ "ex"
+ ],
+ "Required": false,
+ "SortOrder": 101.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "LabelPermutationSeed",
+ "Type": "Int",
+ "Desc": "If specified, determines the permutation seed for applying this featurizer to a multiclass problem.",
+ "Aliases": [
+ "lps"
+ ],
+ "Required": false,
+ "SortOrder": 102.0,
+ "IsNullable": false,
+ "Default": 0
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "IFeaturizerInput",
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.TwoHeterogeneousModelCombiner",
+ "Desc": "Combines a TransformModel and a PredictorModel into a single PredictorModel.",
+ "FriendlyName": null,
+ "ShortName": null,
+ "Inputs": [
+ {
+ "Name": "TransformModel",
+ "Type": "TransformModel",
+ "Desc": "Transform model",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "PredictorModel",
+ "Type": "PredictorModel",
+ "Desc": "Predictor model",
+ "Required": true,
+ "SortOrder": 2.0,
+ "IsNullable": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "PredictorModel",
+ "Type": "PredictorModel",
+ "Desc": "Predictor model"
+ }
+ ]
+ },
+ {
+ "Name": "Transforms.WordTokenizer",
+ "Desc": "The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.",
+ "FriendlyName": "Tokenize Text Transform",
+ "ShortName": "TokenizeTextTransform",
+ "Inputs": [
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "TermSeparators",
+ "Type": "String",
+ "Desc": "Comma separated set of term separator(s). Commonly: 'space', 'comma', 'semicolon' or other single character.",
+ "Aliases": [
+ "sep"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition(s)",
+ "Aliases": [
+ "col"
+ ],
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "TermSeparators",
+ "Type": "String",
+ "Desc": "Comma separated set of term separator(s). Commonly: 'space', 'comma', 'semicolon' or other single character.",
+ "Aliases": [
+ "sep"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": "space"
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ }
+ ],
+ "Components": [
+ {
+ "Kind": "AutoMlEngine",
+ "Components": [
+ {
+ "Name": "Defaults",
+ "Desc": "AutoML engine that returns learners with default settings.",
+ "FriendlyName": "Defaults Engine",
+ "Settings": []
+ },
+ {
+ "Name": "Rocket",
+ "Desc": "AutoML engine that consists of distinct, hierarchical stages of operation.",
+ "FriendlyName": "Rocket Engine",
+ "Settings": [
+ {
+ "Name": "TopKLearners",
+ "Type": "Int",
+ "Desc": "Number of learners to retain for second stage.",
+ "Aliases": [
+ "topk"
+ ],
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": 2
+ },
+ {
+ "Name": "SecondRoundTrialsPerLearner",
+ "Type": "Int",
+ "Desc": "Number of trials for retained second stage learners.",
+ "Aliases": [
+ "stage2num"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": false,
+ "Default": 5
+ },
+ {
+ "Name": "RandomInitialization",
+ "Type": "Bool",
+ "Desc": "Use random initialization only.",
+ "Aliases": [
+ "randinit"
+ ],
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "NumInitializationPipelines",
+ "Type": "Int",
+ "Desc": "Number of initilization pipelines, used for random initialization only.",
+ "Aliases": [
+ "numinitseeds"
+ ],
+ "Required": false,
+ "SortOrder": 4.0,
+ "IsNullable": false,
+ "Default": 20
+ }
+ ]
+ },
+ {
+ "Name": "UniformRandom",
+ "Desc": "AutoML engine using uniform random sampling.",
+ "FriendlyName": "Uniform Random Engine",
+ "Settings": []
+ }
+ ]
+ },
+ {
+ "Kind": "AutoMlStateBase",
+ "Components": [
+ {
+ "Name": "AutoMlState",
+ "Desc": "State of an AutoML search and search space.",
+ "FriendlyName": "AutoML State",
+ "Aliases": [
+ "automlst"
+ ],
+ "Settings": [
+ {
+ "Name": "Metric",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Auc",
+ "AccuracyMicro",
+ "AccuracyMacro",
+ "L2",
+ "F1",
+ "AuPrc",
+ "TopKAccuracy",
+ "Rms",
+ "LossFn",
+ "RSquared",
+ "LogLoss",
+ "LogLossReduction",
+ "Ndcg",
+ "Dcg",
+ "PositivePrecision",
+ "PositiveRecall",
+ "NegativePrecision",
+ "NegativeRecall",
+ "DrAtK",
+ "DrAtPFpr",
+ "DrAtNumPos",
+ "NumAnomalies",
+ "ThreshAtK",
+ "ThreshAtP",
+ "ThreshAtNumPos",
+ "Nmi",
+ "AvgMinScore",
+ "Dbi"
+ ]
+ },
+ "Desc": "Supported metric for evaluator.",
+ "Aliases": [
+ "metric"
+ ],
+ "Required": true,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": "Auc"
+ },
+ {
+ "Name": "Engine",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "AutoMlEngine"
+ },
+ "Desc": "AutoML engine (pipeline optimizer) that generates next candidates.",
+ "Aliases": [
+ "engine"
+ ],
+ "Required": true,
+ "SortOrder": 150.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "TrainerKind",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "SignatureBinaryClassifierTrainer",
+ "SignatureMultiClassClassifierTrainer",
+ "SignatureRankerTrainer",
+ "SignatureRegressorTrainer",
+ "SignatureMultiOutputRegressorTrainer",
+ "SignatureAnomalyDetectorTrainer",
+ "SignatureClusteringTrainer"
+ ]
+ },
+ "Desc": "Kind of trainer for task, such as binary classification trainer, multiclass trainer, etc.",
+ "Aliases": [
+ "tk"
+ ],
+ "Required": true,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": "SignatureBinaryClassifierTrainer"
+ },
+ {
+ "Name": "TerminatorArgs",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "SearchTerminator"
+ },
+ "Desc": "Arguments for creating terminator, which determines when to stop search.",
+ "Aliases": [
+ "term"
+ ],
+ "Required": true,
+ "SortOrder": 150.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "RequestedLearners",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "Learner set to sweep over (if available).",
+ "Aliases": [
+ "learners"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "Kind": "BoosterParameterFunction",
+ "Components": [
{
- "Name": "Defaults",
- "Desc": "AutoML engine that returns learners with default settings.",
- "FriendlyName": "Defaults Engine",
- "Settings": []
+ "Name": "dart",
+ "Desc": "Dropouts meet Multiple Additive Regresion Trees. See https://arxiv.org/abs/1505.01866",
+ "FriendlyName": "Tree Dropout Tree Booster",
+ "Settings": [
+ {
+ "Name": "DropRate",
+ "Type": "Float",
+ "Desc": "Drop ratio for trees. Range:(0,1).",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.1,
+ "Range": {
+ "Inf": 0.0,
+ "Max": 1.0
+ }
+ },
+ {
+ "Name": "MaxDrop",
+ "Type": "Int",
+ "Desc": "Max number of dropped tree in a boosting round.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1,
+ "Range": {
+ "Inf": 0,
+ "Max": 2147483647
+ }
+ },
+ {
+ "Name": "SkipDrop",
+ "Type": "Float",
+ "Desc": "Probability for not perform dropping in a boosting round.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.5,
+ "Range": {
+ "Inf": 0.0,
+ "Max": 1.0
+ }
+ },
+ {
+ "Name": "XgboostDartMode",
+ "Type": "Bool",
+ "Desc": "True will enable xgboost dart mode.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "UniformDrop",
+ "Type": "Bool",
+ "Desc": "True will enable uniform drop.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "UnbalancedSets",
+ "Type": "Bool",
+ "Desc": "Use for binary classification when classes are not balanced.",
+ "Aliases": [
+ "us"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "MinSplitGain",
+ "Type": "Float",
+ "Desc": "Minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.0,
+ "Range": {
+ "Min": 0.0
+ }
+ },
+ {
+ "Name": "MaxDepth",
+ "Type": "Int",
+ "Desc": "Maximum depth of a tree. 0 means no limit. However, tree still grows by best-first.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0,
+ "Range": {
+ "Max": 2147483647,
+ "Min": 0
+ }
+ },
+ {
+ "Name": "MinChildWeight",
+ "Type": "Float",
+ "Desc": "Minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.1,
+ "Range": {
+ "Min": 0.0
+ }
+ },
+ {
+ "Name": "SubsampleFreq",
+ "Type": "Int",
+ "Desc": "Subsample frequency. 0 means no subsample. If subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And the subset will be updated on every Subsample iteratinos.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0,
+ "Range": {
+ "Max": 2147483647,
+ "Min": 0
+ }
+ },
+ {
+ "Name": "Subsample",
+ "Type": "Float",
+ "Desc": "Subsample ratio of the training instance. Setting it to 0.5 means that LightGBM randomly collected half of the data instances to grow trees and this will prevent overfitting. Range: (0,1].",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1.0,
+ "Range": {
+ "Inf": 0.0,
+ "Max": 1.0
+ }
+ },
+ {
+ "Name": "FeatureFraction",
+ "Type": "Float",
+ "Desc": "Subsample ratio of columns when constructing each tree. Range: (0,1].",
+ "Aliases": [
+ "ff"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1.0,
+ "Range": {
+ "Inf": 0.0,
+ "Max": 1.0
+ }
+ },
+ {
+ "Name": "RegLambda",
+ "Type": "Float",
+ "Desc": "L2 regularization term on weights, increasing this value will make model more conservative.",
+ "Aliases": [
+ "l2"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.01,
+ "Range": {
+ "Min": 0.0
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.0,
+ 0.5,
+ 1.0
+ ]
+ }
+ },
+ {
+ "Name": "RegAlpha",
+ "Type": "Float",
+ "Desc": "L1 regularization term on weights, increase this value will make model more conservative.",
+ "Aliases": [
+ "l1"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.0,
+ "Range": {
+ "Min": 0.0
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.0,
+ 0.5,
+ 1.0
+ ]
+ }
+ },
+ {
+ "Name": "ScalePosWeight",
+ "Type": "Float",
+ "Desc": "Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative cases) / sum(positive cases).",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1.0
+ }
+ ]
+ },
+ {
+ "Name": "gbdt",
+ "Desc": "Traditional Gradient Boosting Decision Tree.",
+ "FriendlyName": "Tree Booster",
+ "Settings": [
+ {
+ "Name": "UnbalancedSets",
+ "Type": "Bool",
+ "Desc": "Use for binary classification when classes are not balanced.",
+ "Aliases": [
+ "us"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "MinSplitGain",
+ "Type": "Float",
+ "Desc": "Minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.0,
+ "Range": {
+ "Min": 0.0
+ }
+ },
+ {
+ "Name": "MaxDepth",
+ "Type": "Int",
+ "Desc": "Maximum depth of a tree. 0 means no limit. However, tree still grows by best-first.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0,
+ "Range": {
+ "Max": 2147483647,
+ "Min": 0
+ }
+ },
+ {
+ "Name": "MinChildWeight",
+ "Type": "Float",
+ "Desc": "Minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.1,
+ "Range": {
+ "Min": 0.0
+ }
+ },
+ {
+ "Name": "SubsampleFreq",
+ "Type": "Int",
+ "Desc": "Subsample frequency. 0 means no subsample. If subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And the subset will be updated on every Subsample iteratinos.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0,
+ "Range": {
+ "Max": 2147483647,
+ "Min": 0
+ }
+ },
+ {
+ "Name": "Subsample",
+ "Type": "Float",
+ "Desc": "Subsample ratio of the training instance. Setting it to 0.5 means that LightGBM randomly collected half of the data instances to grow trees and this will prevent overfitting. Range: (0,1].",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1.0,
+ "Range": {
+ "Inf": 0.0,
+ "Max": 1.0
+ }
+ },
+ {
+ "Name": "FeatureFraction",
+ "Type": "Float",
+ "Desc": "Subsample ratio of columns when constructing each tree. Range: (0,1].",
+ "Aliases": [
+ "ff"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1.0,
+ "Range": {
+ "Inf": 0.0,
+ "Max": 1.0
+ }
+ },
+ {
+ "Name": "RegLambda",
+ "Type": "Float",
+ "Desc": "L2 regularization term on weights, increasing this value will make model more conservative.",
+ "Aliases": [
+ "l2"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.01,
+ "Range": {
+ "Min": 0.0
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.0,
+ 0.5,
+ 1.0
+ ]
+ }
+ },
+ {
+ "Name": "RegAlpha",
+ "Type": "Float",
+ "Desc": "L1 regularization term on weights, increase this value will make model more conservative.",
+ "Aliases": [
+ "l1"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.0,
+ "Range": {
+ "Min": 0.0
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.0,
+ 0.5,
+ 1.0
+ ]
+ }
+ },
+ {
+ "Name": "ScalePosWeight",
+ "Type": "Float",
+ "Desc": "Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative cases) / sum(positive cases).",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1.0
+ }
+ ]
},
{
- "Name": "Rocket",
- "Desc": "AutoML engine that consists of distinct, hierarchical stages of operation.",
- "FriendlyName": "Rocket Engine",
+ "Name": "goss",
+ "Desc": "Gradient-based One-Side Sampling.",
+ "FriendlyName": "Gradient-based One-Size Sampling",
"Settings": [
{
- "Name": "TopKLearners",
- "Type": "Int",
- "Desc": "Number of learners to retain for second stage.",
- "Aliases": [
- "topk"
- ],
+ "Name": "TopRate",
+ "Type": "Float",
+ "Desc": "Retain ratio for large gradient instances.",
"Required": false,
- "SortOrder": 1.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 2
+ "Default": 0.2,
+ "Range": {
+ "Inf": 0.0,
+ "Max": 1.0
+ }
},
{
- "Name": "SecondRoundTrialsPerLearner",
- "Type": "Int",
- "Desc": "Number of trials for retained second stage learners.",
- "Aliases": [
- "stage2num"
- ],
+ "Name": "OtherRate",
+ "Type": "Float",
+ "Desc": "Retain ratio for small gradient instances.",
"Required": false,
- "SortOrder": 2.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 5
+ "Default": 0.1,
+ "Range": {
+ "Inf": 0.0,
+ "Max": 1.0
+ }
},
{
- "Name": "RandomInitialization",
+ "Name": "UnbalancedSets",
"Type": "Bool",
- "Desc": "Use random initialization only.",
+ "Desc": "Use for binary classification when classes are not balanced.",
"Aliases": [
- "randinit"
+ "us"
],
"Required": false,
- "SortOrder": 3.0,
+ "SortOrder": 150.0,
"IsNullable": false,
"Default": false
},
{
- "Name": "NumInitializationPipelines",
+ "Name": "MinSplitGain",
+ "Type": "Float",
+ "Desc": "Minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be.",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 0.0,
+ "Range": {
+ "Min": 0.0
+ }
+ },
+ {
+ "Name": "MaxDepth",
"Type": "Int",
- "Desc": "Number of initilization pipelines, used for random initialization only.",
- "Aliases": [
- "numinitseeds"
- ],
+ "Desc": "Maximum depth of a tree. 0 means no limit. However, tree still grows by best-first.",
"Required": false,
- "SortOrder": 4.0,
+ "SortOrder": 150.0,
"IsNullable": false,
- "Default": 20
- }
- ]
- },
- {
- "Name": "UniformRandom",
- "Desc": "AutoML engine using uniform random sampling.",
- "FriendlyName": "Uniform Random Engine",
- "Settings": []
- }
- ]
- },
- {
- "Kind": "AutoMlStateBase",
- "Components": [
- {
- "Name": "AutoMlState",
- "Desc": "State of an AutoML search and search space.",
- "FriendlyName": "AutoML State",
- "Aliases": [
- "automlst"
- ],
- "Settings": [
+ "Default": 0,
+ "Range": {
+ "Max": 2147483647,
+ "Min": 0
+ }
+ },
{
- "Name": "Metric",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "Auc",
- "AccuracyMicro",
- "AccuracyMacro",
- "L2",
- "F1",
- "AuPrc",
- "TopKAccuracy",
- "Rms",
- "LossFn",
- "RSquared",
- "LogLoss",
- "LogLossReduction",
- "Ndcg",
- "Dcg",
- "PositivePrecision",
- "PositiveRecall",
- "NegativePrecision",
- "NegativeRecall",
- "DrAtK",
- "DrAtPFpr",
- "DrAtNumPos",
- "NumAnomalies",
- "ThreshAtK",
- "ThreshAtP",
- "ThreshAtNumPos",
- "Nmi",
- "AvgMinScore",
- "Dbi"
- ]
- },
- "Desc": "Supported metric for evaluator.",
- "Aliases": [
- "metric"
- ],
- "Required": true,
+ "Name": "MinChildWeight",
+ "Type": "Float",
+ "Desc": "Minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be.",
+ "Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": "Auc"
+ "Default": 0.1,
+ "Range": {
+ "Min": 0.0
+ }
},
{
- "Name": "Engine",
- "Type": {
- "Kind": "Component",
- "ComponentKind": "AutoMlEngine"
- },
- "Desc": "AutoML engine (pipeline optimizer) that generates next candidates.",
- "Aliases": [
- "engine"
- ],
- "Required": true,
+ "Name": "SubsampleFreq",
+ "Type": "Int",
+ "Desc": "Subsample frequency. 0 means no subsample. If subsampleFreq > 0, it will use a subset(ratio=subsample) to train. And the subset will be updated on every Subsample iteratinos.",
+ "Required": false,
"SortOrder": 150.0,
- "IsNullable": false
+ "IsNullable": false,
+ "Default": 0,
+ "Range": {
+ "Max": 2147483647,
+ "Min": 0
+ }
},
{
- "Name": "TrainerKind",
- "Type": {
- "Kind": "Enum",
- "Values": [
- "SignatureBinaryClassifierTrainer",
- "SignatureMultiClassClassifierTrainer",
- "SignatureRankerTrainer",
- "SignatureRegressorTrainer",
- "SignatureMultiOutputRegressorTrainer",
- "SignatureAnomalyDetectorTrainer",
- "SignatureClusteringTrainer"
- ]
- },
- "Desc": "Kind of trainer for task, such as binary classification trainer, multiclass trainer, etc.",
+ "Name": "Subsample",
+ "Type": "Float",
+ "Desc": "Subsample ratio of the training instance. Setting it to 0.5 means that LightGBM randomly collected half of the data instances to grow trees and this will prevent overfitting. Range: (0,1].",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1.0,
+ "Range": {
+ "Inf": 0.0,
+ "Max": 1.0
+ }
+ },
+ {
+ "Name": "FeatureFraction",
+ "Type": "Float",
+ "Desc": "Subsample ratio of columns when constructing each tree. Range: (0,1].",
"Aliases": [
- "tk"
+ "ff"
],
- "Required": true,
+ "Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": "SignatureBinaryClassifierTrainer"
+ "Default": 1.0,
+ "Range": {
+ "Inf": 0.0,
+ "Max": 1.0
+ }
},
{
- "Name": "TerminatorArgs",
- "Type": {
- "Kind": "Component",
- "ComponentKind": "SearchTerminator"
- },
- "Desc": "Arguments for creating terminator, which determines when to stop search.",
+ "Name": "RegLambda",
+ "Type": "Float",
+ "Desc": "L2 regularization term on weights, increasing this value will make model more conservative.",
"Aliases": [
- "term"
+ "l2"
],
- "Required": true,
+ "Required": false,
"SortOrder": 150.0,
- "IsNullable": false
+ "IsNullable": false,
+ "Default": 0.01,
+ "Range": {
+ "Min": 0.0
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.0,
+ 0.5,
+ 1.0
+ ]
+ }
},
{
- "Name": "RequestedLearners",
- "Type": {
- "Kind": "Array",
- "ItemType": "String"
- },
- "Desc": "Learner set to sweep over (if available).",
+ "Name": "RegAlpha",
+ "Type": "Float",
+ "Desc": "L1 regularization term on weights, increase this value will make model more conservative.",
"Aliases": [
- "learners"
+ "l1"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": null
+ "Default": 0.0,
+ "Range": {
+ "Min": 0.0
+ },
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.0,
+ 0.5,
+ 1.0
+ ]
+ }
+ },
+ {
+ "Name": "ScalePosWeight",
+ "Type": "Float",
+ "Desc": "Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: sum(negative cases) / sum(positive cases).",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1.0
}
]
}
@@ -24490,6 +26969,17 @@
}
]
},
+ {
+ "Kind": "ParallelLightGBM",
+ "Components": [
+ {
+ "Name": "Single",
+ "Desc": "Single node machine learning process.",
+ "FriendlyName": "Single",
+ "Settings": []
+ }
+ ]
+ },
{
"Kind": "ParallelTraining",
"Components": [
diff --git a/test/Directory.Build.targets b/test/Directory.Build.targets
index c497a50172..88f693b838 100644
--- a/test/Directory.Build.targets
+++ b/test/Directory.Build.targets
@@ -1,34 +1,5 @@
-
-
-
-
- lib
- .dll
- .so
- .dylib
-
-
-
-
- $(NativeOutputPath)$(LibPrefix)%(NativeAssemblyReference.Identity)$(LibExtension)
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj b/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj
index 3cf404f981..bed3dce0eb 100644
--- a/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj
+++ b/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj
@@ -10,6 +10,7 @@
+
diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
index 684dabfbbd..de0db6f3a9 100644
--- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
+++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
@@ -1803,6 +1803,18 @@ public void EntryPointEvaluateRanking()
}
}
+ [Fact]
+ public void EntryPointLightGbmBinary()
+ {
+ TestEntryPointRoutine("breast-cancer.txt", "Trainers.LightGbmBinaryClassifier");
+ }
+
+ [Fact]
+ public void EntryPointLightGbmMultiClass()
+ {
+ TestEntryPointRoutine(GetDataPath(@"iris.txt"), "Trainers.LightGbmClassifier");
+ }
+
[Fact]
public void EntryPointSdcaBinary()
{
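The two new tests above drive the LightGBM entry points through the same graph machinery. As a rough illustration only: the entry-point name matches the test, but the port names (TrainingData, PredictorModel), the graph envelope, and the wrapper class are assumptions rather than something this diff defines.

namespace Samples
{
    // Illustrative sketch only. "Trainers.LightGbmBinaryClassifier" is the entry point
    // exercised by EntryPointLightGbmBinary above; port names and envelope are assumed.
    internal static class LightGbmBinaryGraphSample
    {
        // A single trainer node: feeds a DataView in, gets a predictor model out.
        public const string Graph = @"
        {
          ""Nodes"": [
            {
              ""Name"": ""Trainers.LightGbmBinaryClassifier"",
              ""Inputs"": { ""TrainingData"": ""$data"" },
              ""Outputs"": { ""PredictorModel"": ""$model"" }
            }
          ]
        }";
    }
}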
diff --git a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs
index 26ba5d118e..928f740b57 100644
--- a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs
+++ b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs
@@ -407,7 +407,7 @@ public void FastTreeBinaryClassificationTest()
});
Done();
}
-
+
[Fact]
[TestCategory("Binary")]
[TestCategory("LightGBM")]
diff --git a/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs b/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs
index 1ebc2489ec..b104570ca7 100644
--- a/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs
+++ b/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs
@@ -32,6 +32,18 @@ public void TrainAndPredictSentimentModelTest()
ValidateBinaryMetrics(metrics);
}
+ [Fact]
+ public void TrainAndPredictLightGBMSentimentModelTest()
+ {
+ var pipeline = PreparePipelineLightGBM();
+ var model = pipeline.Train<SentimentData, SentimentPrediction>();
+ var testData = PrepareTextLoaderTestData();
+ var evaluator = new BinaryClassificationEvaluator();
+ var metrics = evaluator.Evaluate(model, testData);
+ ValidateExamplesLightGBM(model);
+ ValidateBinaryMetricsLightGBM(metrics);
+ }
+
[Fact]
public void TrainTestPredictSentimentModelTest()
{
@@ -163,8 +175,42 @@ public void CrossValidateSentimentModelTest()
Assert.True(predictions.ElementAt(1).Sentiment.IsTrue);
}
+ private void ValidateBinaryMetricsLightGBM(BinaryClassificationMetrics metrics)
+ {
+
+ Assert.Equal(.6111, metrics.Accuracy, 4);
+ Assert.Equal(.8, metrics.Auc, 1);
+ Assert.Equal(.85, metrics.Auprc, 2);
+ Assert.Equal(1, metrics.Entropy, 3);
+ Assert.Equal(.72, metrics.F1Score, 4);
+ Assert.Equal(.952, metrics.LogLoss, 3);
+ Assert.Equal(4.777, metrics.LogLossReduction, 3);
+ Assert.Equal(1, metrics.NegativePrecision, 3);
+ Assert.Equal(.222, metrics.NegativeRecall, 3);
+ Assert.Equal(.562, metrics.PositivePrecision, 3);
+ Assert.Equal(1, metrics.PositiveRecall);
+
+ var matrix = metrics.ConfusionMatrix;
+ Assert.Equal(2, matrix.Order);
+ Assert.Equal(2, matrix.ClassNames.Count);
+ Assert.Equal("positive", matrix.ClassNames[0]);
+ Assert.Equal("negative", matrix.ClassNames[1]);
+
+ Assert.Equal(9, matrix[0, 0]);
+ Assert.Equal(9, matrix["positive", "positive"]);
+ Assert.Equal(0, matrix[0, 1]);
+ Assert.Equal(0, matrix["positive", "negative"]);
+
+ Assert.Equal(7, matrix[1, 0]);
+ Assert.Equal(7, matrix["negative", "positive"]);
+ Assert.Equal(2, matrix[1, 1]);
+ Assert.Equal(2, matrix["negative", "negative"]);
+
+ }
+
private void ValidateBinaryMetrics(BinaryClassificationMetrics metrics)
{
+
Assert.Equal(.5556, metrics.Accuracy, 4);
Assert.Equal(.8, metrics.Auc, 1);
Assert.Equal(.87, metrics.Auprc, 2);
@@ -236,18 +282,81 @@ private LearningPipeline PreparePipeline()
WordFeatureExtractor = new NGramNgramExtractor() { NgramLength = 2, AllLengths = true }
});
+
pipeline.Add(new FastTreeBinaryClassifier() { NumLeaves = 5, NumTrees = 5, MinDocumentsInLeafs = 2 });
+
+ pipeline.Add(new PredictedLabelColumnOriginalValueConverter() { PredictedLabelColumn = "PredictedLabel" });
+ return pipeline;
+ }
+
+ private LearningPipeline PreparePipelineLightGBM()
+ {
+ var dataPath = GetDataPath(SentimentDataPath);
+ var pipeline = new LearningPipeline();
+
+ pipeline.Add(new Data.TextLoader(dataPath)
+ {
+ Arguments = new TextLoaderArguments
+ {
+ Separator = new[] { '\t' },
+ HasHeader = true,
+ Column = new[]
+ {
+ new TextLoaderColumn()
+ {
+ Name = "Label",
+ Source = new [] { new TextLoaderRange(0) },
+ Type = Data.DataKind.Num
+ },
+
+ new TextLoaderColumn()
+ {
+ Name = "SentimentText",
+ Source = new [] { new TextLoaderRange(1) },
+ Type = Data.DataKind.Text
+ }
+ }
+ }
+ });
+
+ pipeline.Add(new TextFeaturizer("Features", "SentimentText")
+ {
+ KeepDiacritics = false,
+ KeepPunctuations = false,
+ TextCase = TextNormalizerTransformCaseNormalizationMode.Lower,
+ OutputTokens = true,
+ StopWordsRemover = new PredefinedStopWordsRemover(),
+ VectorNormalizer = TextTransformTextNormKind.L2,
+ CharFeatureExtractor = new NGramNgramExtractor() { NgramLength = 3, AllLengths = false },
+ WordFeatureExtractor = new NGramNgramExtractor() { NgramLength = 2, AllLengths = true }
+ });
+
+
+ pipeline.Add(new LightGbmBinaryClassifier() { NumLeaves = 5, NumBoostRound = 5, MinDataPerLeaf = 2 });
+
pipeline.Add(new PredictedLabelColumnOriginalValueConverter() { PredictedLabelColumn = "PredictedLabel" });
return pipeline;
}
- private void ValidateExamples(PredictionModel<SentimentData, SentimentPrediction> model)
+ private void ValidateExamples(PredictionModel<SentimentData, SentimentPrediction> model, bool useLightGBM = false)
{
var sentiments = GetTestData();
var predictions = model.Predict(sentiments);
Assert.Equal(2, predictions.Count());
+
Assert.True(predictions.ElementAt(0).Sentiment.IsFalse);
Assert.True(predictions.ElementAt(1).Sentiment.IsTrue);
+
+ }
+
+ private void ValidateExamplesLightGBM(PredictionModel<SentimentData, SentimentPrediction> model)
+ {
+ var sentiments = GetTestData();
+ var predictions = model.Predict(sentiments);
+ Assert.Equal(2, predictions.Count());
+
+ Assert.True(predictions.ElementAt(0).Sentiment.IsTrue);
+ Assert.True(predictions.ElementAt(1).Sentiment.IsTrue);
}
private Data.TextLoader PrepareTextLoaderTestData()