From fe889f95f7f9a1646d8e39759ff97376c2274320 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Mon, 18 Jun 2018 15:16:54 -0700 Subject: [PATCH 01/20] Introduce Ensemble codebase --- Microsoft.ML.sln | 7 + src/Microsoft.ML.Ensemble/Batch.cs | 22 + src/Microsoft.ML.Ensemble/EnsembleUtils.cs | 114 ++ .../EntryPoints/CreateEnsemble.cs | 418 +++++++ .../EntryPoints/PipelineEnsemble.cs | 56 + .../FeatureSubsetModel.cs | 33 + .../Microsoft.ML.Ensemble.csproj | 15 + .../OutputCombiners/Average.cs | 62 + .../OutputCombiners/BaseAverager.cs | 78 ++ .../OutputCombiners/BaseMultiAverager.cs | 68 ++ .../OutputCombiners/BaseMultiCombiner.cs | 109 ++ .../OutputCombiners/BaseScalarStacking.cs | 35 + .../OutputCombiners/BaseStacking.cs | 199 ++++ .../OutputCombiners/IOutputCombiner.cs | 54 + .../OutputCombiners/Median.cs | 87 ++ .../OutputCombiners/MultiAverage.cs | 65 + .../OutputCombiners/MultiMedian.cs | 95 ++ .../OutputCombiners/MultiStacking.cs | 96 ++ .../OutputCombiners/MultiVoting.cs | 105 ++ .../OutputCombiners/MultiWeightedAverage.cs | 100 ++ .../OutputCombiners/RegressionStacking.cs | 65 + .../OutputCombiners/Stacking.cs | 66 ++ .../OutputCombiners/Voting.cs | 93 ++ .../OutputCombiners/WeightedAverage.cs | 107 ++ src/Microsoft.ML.Ensemble/PipelineEnsemble.cs | 753 ++++++++++++ .../BaseDisagreementDiversityMeasure.cs | 45 + .../DisagreementDiversityMeasure.cs | 25 + .../DiversityMeasure/ModelDiversityMetric.cs | 15 + .../MultiDisagreementDiversityMeasure.cs | 26 + .../RegressionDisagreementDiversityMeasure.cs | 24 + .../FeatureSelector/AllFeatureSelector.cs | 28 + .../FeatureSelector/RandomFeatureSelector.cs | 58 + .../Selector/IDiversityMeasure.cs | 18 + .../Selector/IFeatureSelector.cs | 15 + .../Selector/ISubModelSelector.cs | 29 + .../Selector/ISubsetSelector.cs | 20 + .../Selector/SubModelSelector/AllSelector.cs | 28 + .../SubModelSelector/AllSelectorMultiClass.cs | 30 + .../BaseBestPerformanceSelector.cs | 124 ++ .../SubModelSelector/BaseDiverseSelector.cs | 144 
+++ .../SubModelSelector/BaseSubModelSelector.cs | 137 +++ .../BestDiverseSelectorBinary.cs | 46 + .../BestDiverseSelectorMultiClass.cs | 50 + .../BestDiverseSelectorRegression.cs | 46 + .../BestPerformanceRegressionSelector.cs | 57 + .../BestPerformanceSelector.cs | 58 + .../BestPerformanceSelectorMultiClass.cs | 59 + .../SubModelSelector/SubModelDataSelector.cs | 44 + .../SubsetSelector/AllInstanceSelector.cs | 35 + .../SubsetSelector/BaseSubsetSelector.cs | 105 ++ .../SubsetSelector/BootstrapSelector.cs | 49 + .../SubsetSelector/RandomPartitionSelector.cs | 48 + src/Microsoft.ML.Ensemble/Subset.cs | 22 + .../Trainer/Binary/EnsembleTrainer.cs | 91 ++ .../Trainer/EnsembleDistributionPredictor.cs | 232 ++++ .../Trainer/EnsemblePredictor.cs | 148 +++ .../Trainer/EnsemblePredictorBase.cs | 167 +++ .../Trainer/EnsembleTrainerBase.cs | 253 ++++ .../Multiclass/EnsembleMultiClassPredictor.cs | 152 +++ .../MulticlassDataPartitionEnsembleTrainer.cs | 74 ++ .../Regression/RegressionEnsembleTrainer.cs | 73 ++ src/Microsoft.ML.Ensemble/WeightedValue.cs | 14 + src/Microsoft.ML/CSharpApi.cs | 331 ++++++ .../Common/EntryPoints/core_ep-list.tsv | 7 + .../Common/EntryPoints/core_manifest.json | 370 ++++++ .../Microsoft.ML.Core.Tests.csproj | 1 + .../UnitTests/TestEntryPoints.cs | 1042 +++++++++++++---- .../Microsoft.ML.Tests.csproj | 1 + test/data/lm.sample.txt | 120 ++ 69 files changed, 7164 insertions(+), 199 deletions(-) create mode 100644 src/Microsoft.ML.Ensemble/Batch.cs create mode 100644 src/Microsoft.ML.Ensemble/EnsembleUtils.cs create mode 100644 src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs create mode 100644 src/Microsoft.ML.Ensemble/EntryPoints/PipelineEnsemble.cs create mode 100644 src/Microsoft.ML.Ensemble/FeatureSubsetModel.cs create mode 100644 src/Microsoft.ML.Ensemble/Microsoft.ML.Ensemble.csproj create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/Average.cs create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/BaseAverager.cs create 
mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/BaseScalarStacking.cs create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/BaseStacking.cs create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/IOutputCombiner.cs create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/Median.cs create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/MultiWeightedAverage.cs create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/Stacking.cs create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/Voting.cs create mode 100644 src/Microsoft.ML.Ensemble/OutputCombiners/WeightedAverage.cs create mode 100644 src/Microsoft.ML.Ensemble/PipelineEnsemble.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/BaseDisagreementDiversityMeasure.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/DisagreementDiversityMeasure.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/ModelDiversityMetric.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/MultiDisagreementDiversityMeasure.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/RegressionDisagreementDiversityMeasure.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/FeatureSelector/AllFeatureSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/FeatureSelector/RandomFeatureSelector.cs create mode 100644 
src/Microsoft.ML.Ensemble/Selector/IDiversityMeasure.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/IFeatureSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/ISubsetSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelectorMultiClass.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseBestPerformanceSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseDiverseSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseSubModelSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubModelSelector/SubModelDataSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubsetSelector/AllInstanceSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BaseSubsetSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/Subset.cs create mode 100644 src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs create mode 100644 
src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs create mode 100644 src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictor.cs create mode 100644 src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictorBase.cs create mode 100644 src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs create mode 100644 src/Microsoft.ML.Ensemble/Trainer/Multiclass/EnsembleMultiClassPredictor.cs create mode 100644 src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs create mode 100644 src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs create mode 100644 src/Microsoft.ML.Ensemble/WeightedValue.cs create mode 100644 test/data/lm.sample.txt diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln index 3529c0e5b7..0d4f964b3d 100644 --- a/Microsoft.ML.sln +++ b/Microsoft.ML.sln @@ -106,6 +106,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Maml", "src\Mi EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Console", "src\Microsoft.ML.Console\Microsoft.ML.Console.csproj", "{362A98CF-FBF7-4EBB-A11B-990BBF845B15}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Ensemble", "src\Microsoft.ML.Ensemble\Microsoft.ML.Ensemble.csproj", "{BD1AB597-7943-47CA-8DC3-88FDB885F8CF}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -208,6 +210,10 @@ Global {362A98CF-FBF7-4EBB-A11B-990BBF845B15}.Debug|Any CPU.Build.0 = Debug|Any CPU {362A98CF-FBF7-4EBB-A11B-990BBF845B15}.Release|Any CPU.ActiveCfg = Release|Any CPU {362A98CF-FBF7-4EBB-A11B-990BBF845B15}.Release|Any CPU.Build.0 = Release|Any CPU + {BD1AB597-7943-47CA-8DC3-88FDB885F8CF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {BD1AB597-7943-47CA-8DC3-88FDB885F8CF}.Debug|Any CPU.Build.0 = Debug|Any CPU + {BD1AB597-7943-47CA-8DC3-88FDB885F8CF}.Release|Any CPU.ActiveCfg = Release|Any CPU + {BD1AB597-7943-47CA-8DC3-88FDB885F8CF}.Release|Any CPU.Build.0 = Release|Any 
CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -243,6 +249,7 @@ Global {7A9DB75F-2CA5-4184-9EF5-1F17EB39483F} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} {64F40A0D-D4C2-4AA7-8470-E9CC437827E4} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {362A98CF-FBF7-4EBB-A11B-990BBF845B15} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {BD1AB597-7943-47CA-8DC3-88FDB885F8CF} = {09EADF06-BE25-4228-AB53-95AE3E15B530} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D} diff --git a/src/Microsoft.ML.Ensemble/Batch.cs b/src/Microsoft.ML.Ensemble/Batch.cs new file mode 100644 index 0000000000..e9c8fcf179 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Batch.cs @@ -0,0 +1,22 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Runtime.Data; + +namespace Microsoft.ML.Runtime.Ensemble +{ + public sealed class Batch + { + public readonly RoleMappedData TrainInstances; + public readonly RoleMappedData TestInstances; + + public Batch(RoleMappedData trainData, RoleMappedData testData) + { + Contracts.CheckValue(trainData, nameof(trainData)); + Contracts.CheckValue(testData, nameof(testData)); + TrainInstances = trainData; + TestInstances = testData; + } + } +} diff --git a/src/Microsoft.ML.Ensemble/EnsembleUtils.cs b/src/Microsoft.ML.Ensemble/EnsembleUtils.cs new file mode 100644 index 0000000000..088a7a9c96 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/EnsembleUtils.cs @@ -0,0 +1,114 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Internal.Utilities; + +namespace Microsoft.ML.Runtime.Ensemble +{ + internal static class EnsembleUtils + { + /// + /// Return a dataset with non-selected features zeroed out. + /// + public static RoleMappedData SelectFeatures(IHost host, RoleMappedData data, BitArray features) + { + Contracts.AssertValue(host); + Contracts.AssertValue(data); + Contracts.Assert(data.Schema.Feature != null); + Contracts.AssertValue(features); + + var type = data.Schema.Feature.Type; + Contracts.Assert(features.Length == type.VectorSize); + int card = Utils.GetCardinality(features); + if (card == type.VectorSize) + return data; + + // REVIEW shonk: This doesn't preserve metadata on the features column. Should it? + var name = data.Schema.Feature.Name; + var view = LambdaColumnMapper.Create( + host, "FeatureSelector", data.Data, name, name, type, type, + (ref VBuffer src, ref VBuffer dst) => SelectFeatures(ref src, features, card, ref dst)); + + var res = RoleMappedData.Create(view, data.Schema.GetColumnRoleNames()); + return res; + } + + /// + /// Fill dst with values selected from src if the indices of the src values are set in includedIndices, + /// otherwise assign default(T). The length of dst will be equal to src.Length. + /// + public static void SelectFeatures(ref VBuffer src, BitArray includedIndices, int cardinality, ref VBuffer dst) + { + Contracts.Assert(Utils.Size(includedIndices) == src.Length); + Contracts.Assert(cardinality == Utils.GetCardinality(includedIndices)); + Contracts.Assert(cardinality < src.Length); + + var values = dst.Values; + var indices = dst.Indices; + + if (src.IsDense) + { + if (cardinality >= src.Length / 2) + { + T defaultValue = default; + if (Utils.Size(values) < src.Length) + values = new T[src.Length]; + for (int i = 0; i < src.Length; i++) + values[i] = !includedIndices[i] ? 
defaultValue : src.Values[i]; + dst = new VBuffer(src.Length, values, indices); + } + else + { + if (Utils.Size(values) < cardinality) + values = new T[cardinality]; + if (Utils.Size(indices) < cardinality) + indices = new int[cardinality]; + + int count = 0; + for (int i = 0; i < src.Length; i++) + { + if (includedIndices[i]) + { + Contracts.Assert(count < cardinality); + values[count] = src.Values[i]; + indices[count] = i; + count++; + } + } + + Contracts.Assert(count == cardinality); + dst = new VBuffer(src.Length, count, values, indices); + } + } + else + { + int valuesSize = Utils.Size(values); + int indicesSize = Utils.Size(indices); + if (valuesSize < src.Count || indicesSize < src.Count) + { + if (valuesSize < cardinality) + values = new T[cardinality]; + if (indicesSize < cardinality) + indices = new int[cardinality]; + } + + int count = 0; + for (int i = 0; i < src.Count; i++) + { + if (includedIndices[src.Indices[i]]) + { + values[count] = src.Values[i]; + indices[count] = src.Indices[i]; + count++; + } + } + + dst = new VBuffer(src.Length, count, values, indices); + } + } + } +} diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs b/src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs new file mode 100644 index 0000000000..ee7cb8fcac --- /dev/null +++ b/src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs @@ -0,0 +1,418 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.IO; +using System.IO.Compression; +using System.Linq; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble; +using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.EntryPoints; +using Microsoft.ML.Runtime.Internal.Utilities; +using Microsoft.ML.Runtime.Learners; + +[assembly: LoadableClass(typeof(void), typeof(EnsembleCreator), null, typeof(SignatureEntryPointModule), "CreateEnsemble")] + +namespace Microsoft.ML.Runtime.EntryPoints +{ + /// + /// A component to combine given models into an ensemble model. + /// + public static class EnsembleCreator + { + /// + /// These are the combiner options for binary and multi class classifiers. + /// + public enum ClassifierCombiner + { + Median, + Average, + Vote, + } + + /// + /// These are the combiner options for regression and anomaly detection. + /// + public enum ScoreCombiner + { + Median, + Average, + } + + public abstract class PipelineInputBase + { + [Argument(ArgumentType.Required, ShortName = "models", HelpText = "The models to combine into an ensemble", SortOrder = 1)] + public IPredictorModel[] Models; + } + + public abstract class InputBase + { + [Argument(ArgumentType.Required, ShortName = "models", HelpText = "The models to combine into an ensemble", SortOrder = 1)] + public IPredictorModel[] Models; + + [Argument(ArgumentType.AtMostOnce, ShortName = "validate", HelpText = "Whether to validate that all the pipelines are identical", SortOrder = 5)] + public bool ValidatePipelines = true; + } + + public sealed class ClassifierInput : InputBase + { + [Argument(ArgumentType.AtMostOnce, ShortName = "combiner", HelpText = "The combiner used to combine the scores", SortOrder = 2)] + public ClassifierCombiner ModelCombiner = ClassifierCombiner.Median; + } + + public sealed class PipelineClassifierInput : PipelineInputBase + { + 
[Argument(ArgumentType.AtMostOnce, ShortName = "combiner", HelpText = "The combiner used to combine the scores", SortOrder = 2)] + public ClassifierCombiner ModelCombiner = ClassifierCombiner.Median; + } + + public sealed class RegressionInput : InputBase + { + [Argument(ArgumentType.AtMostOnce, ShortName = "combiner", HelpText = "The combiner used to combine the scores", SortOrder = 2)] + public ScoreCombiner ModelCombiner = ScoreCombiner.Median; + } + + public sealed class PipelineRegressionInput : PipelineInputBase + { + [Argument(ArgumentType.AtMostOnce, ShortName = "combiner", HelpText = "The combiner used to combine the scores", SortOrder = 2)] + public ScoreCombiner ModelCombiner = ScoreCombiner.Median; + } + + public sealed class PipelineAnomalyInput : PipelineInputBase + { + [Argument(ArgumentType.AtMostOnce, ShortName = "combiner", HelpText = "The combiner used to combine the scores", SortOrder = 2)] + public ScoreCombiner ModelCombiner = ScoreCombiner.Average; + } + + private static void GetPipeline(IHostEnvironment env, InputBase input, out IDataView startingData, out RoleMappedData transformedData) + { + Contracts.AssertValue(env); + env.AssertValue(input); + env.AssertNonEmpty(input.Models); + + ISchema inputSchema = null; + startingData = null; + transformedData = null; + byte[][] transformedDataSerialized = null; + string[] transformedDataZipEntryNames = null; + for (int i = 0; i < input.Models.Length; i++) + { + var model = input.Models[i]; + + var inputData = new EmptyDataView(env, model.TransformModel.InputSchema); + model.PrepareData(env, inputData, out RoleMappedData transformedDataCur, out IPredictor pred); + + if (inputSchema == null) + { + env.Assert(i == 0); + inputSchema = model.TransformModel.InputSchema; + startingData = inputData; + transformedData = transformedDataCur; + } + else if (input.ValidatePipelines) + { + using (var ch = env.Start("Validating pipeline")) + { + if (transformedDataSerialized == null) + { + 
ch.Assert(transformedDataZipEntryNames == null); + SerializeRoleMappedData(env, ch, transformedData, out transformedDataSerialized, + out transformedDataZipEntryNames); + } + CheckSamePipeline(env, ch, transformedDataCur, transformedDataSerialized, transformedDataZipEntryNames); + ch.Done(); + } + } + } + } + + private static IEnumerable> GetWeightedModels(IEnumerable models) + where T : class, IPredictor + { + return models.Select(predictor => new WeightedValue() + { + Value = predictor.Predictor as T, + Weight = 1 + }); + } + + [TlcModule.EntryPoint(Name = "Models.BinaryEnsemble", Desc = "Combine binary classifiers into an ensemble", UserName = EnsembleTrainer.UserNameValue)] + public static CommonOutputs.BinaryClassificationOutput CreateBinaryEnsemble(IHostEnvironment env, ClassifierInput input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("CombineModels"); + host.CheckValue(input, nameof(input)); + host.CheckNonEmpty(input.Models, nameof(input.Models)); + + GetPipeline(host, input, out IDataView startingData, out RoleMappedData transformedData); + + var args = new EnsembleTrainer.Arguments(); + switch (input.ModelCombiner) + { + case ClassifierCombiner.Median: + args.OutputCombiner = new SubComponent("Median"); + break; + case ClassifierCombiner.Average: + args.OutputCombiner = new SubComponent("Average"); + break; + case ClassifierCombiner.Vote: + args.OutputCombiner = new SubComponent("Voting"); + break; + default: + throw host.Except("Unknown combiner kind"); + } + + var trainer = new EnsembleTrainer(host, args); + var weightedModels = GetWeightedModels>(input.Models); + var ensemble = trainer.CombineModels(weightedModels); + + var predictorModel = new PredictorModel(host, transformedData, startingData, ensemble); + + var output = new CommonOutputs.BinaryClassificationOutput { PredictorModel = predictorModel }; + return output; + } + + [TlcModule.EntryPoint(Name = "Models.RegressionEnsemble", Desc = "Combine regression models into 
an ensemble", UserName = RegressionEnsembleTrainer.UserNameValue)] + public static CommonOutputs.RegressionOutput CreateRegressionEnsemble(IHostEnvironment env, RegressionInput input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("CombineModels"); + host.CheckValue(input, nameof(input)); + host.CheckNonEmpty(input.Models, nameof(input.Models)); + + GetPipeline(host, input, out IDataView startingData, out RoleMappedData transformedData); + + var args = new RegressionEnsembleTrainer.Arguments(); + switch (input.ModelCombiner) + { + case ScoreCombiner.Median: + args.OutputCombiner = new SubComponent("Median"); + break; + case ScoreCombiner.Average: + args.OutputCombiner = new SubComponent("Average"); + break; + default: + throw host.Except("Unknown combiner kind"); + } + + var trainer = new RegressionEnsembleTrainer(host, args); + var weightedModels = GetWeightedModels>(input.Models); + var ensemble = trainer.CombineModels(weightedModels); + + var predictorModel = new PredictorModel(host, transformedData, startingData, ensemble); + + var output = new CommonOutputs.RegressionOutput { PredictorModel = predictorModel }; + return output; + } + + [TlcModule.EntryPoint(Name = "Models.BinaryPipelineEnsemble", Desc = "Combine binary classification models into an ensemble")] + public static CommonOutputs.BinaryClassificationOutput CreateBinaryPipelineEnsemble(IHostEnvironment env, PipelineClassifierInput input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("CombineModels"); + host.CheckValue(input, nameof(input)); + host.CheckNonEmpty(input.Models, nameof(input.Models)); + + IBinaryOutputCombiner combiner; + switch (input.ModelCombiner) + { + case ClassifierCombiner.Median: + combiner = new Median(host); + break; + case ClassifierCombiner.Average: + combiner = new Average(host); + break; + case ClassifierCombiner.Vote: + combiner = new Voting(host); + break; + default: + throw host.Except("Unknown combiner kind"); + } + var 
ensemble = SchemaBindablePipelineEnsembleBase.Create(host, input.Models, combiner, MetadataUtils.Const.ScoreColumnKind.BinaryClassification); + return CreatePipelineEnsemble(host, input.Models, ensemble); + } + + [TlcModule.EntryPoint(Name = "Models.RegressionPipelineEnsemble", Desc = "Combine regression models into an ensemble")] + public static CommonOutputs.RegressionOutput CreateRegressionPipelineEnsemble(IHostEnvironment env, PipelineRegressionInput input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("CombineModels"); + host.CheckValue(input, nameof(input)); + host.CheckNonEmpty(input.Models, nameof(input.Models)); + + IRegressionOutputCombiner combiner; + switch (input.ModelCombiner) + { + case ScoreCombiner.Median: + combiner = new Median(host); + break; + case ScoreCombiner.Average: + combiner = new Average(host); + break; + default: + throw host.Except("Unknown combiner kind"); + } + var ensemble = SchemaBindablePipelineEnsembleBase.Create(host, input.Models, combiner, MetadataUtils.Const.ScoreColumnKind.Regression); + return CreatePipelineEnsemble(host, input.Models, ensemble); + } + + [TlcModule.EntryPoint(Name = "Models.MultiClassPipelineEnsemble", Desc = "Combine multiclass classifiers into an ensemble")] + public static CommonOutputs.MulticlassClassificationOutput CreateMultiClassPipelineEnsemble(IHostEnvironment env, PipelineClassifierInput input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("CombineModels"); + host.CheckValue(input, nameof(input)); + host.CheckNonEmpty(input.Models, nameof(input.Models)); + + IOutputCombiner> combiner; + switch (input.ModelCombiner) + { + case ClassifierCombiner.Median: + combiner = new MultiMedian(host, new MultiMedian.Arguments() { Normalize = true }); + break; + case ClassifierCombiner.Average: + combiner = new MultiAverage(host, new MultiAverage.Arguments() { Normalize = true }); + break; + case ClassifierCombiner.Vote: + combiner = new MultiVoting(host); + 
break; + default: + throw host.Except("Unknown combiner kind"); + } + var ensemble = SchemaBindablePipelineEnsembleBase.Create(host, input.Models, combiner, MetadataUtils.Const.ScoreColumnKind.MultiClassClassification); + return CreatePipelineEnsemble(host, input.Models, ensemble); + } + + [TlcModule.EntryPoint(Name = "Models.AnomalyPipelineEnsemble", Desc = "Combine anomaly detection models into an ensemble")] + public static CommonOutputs.AnomalyDetectionOutput CreateAnomalyPipelineEnsemble(IHostEnvironment env, PipelineAnomalyInput input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("CombineModels"); + host.CheckValue(input, nameof(input)); + host.CheckNonEmpty(input.Models, nameof(input.Models)); + + IOutputCombiner combiner; + switch (input.ModelCombiner) + { + case ScoreCombiner.Median: + combiner = new Median(host); + break; + case ScoreCombiner.Average: + combiner = new Average(host); + break; + default: + throw host.Except("Unknown combiner kind"); + } + var ensemble = SchemaBindablePipelineEnsembleBase.Create(host, input.Models, combiner, MetadataUtils.Const.ScoreColumnKind.AnomalyDetection); + return CreatePipelineEnsemble(host, input.Models, ensemble); + } + + private static TOut CreatePipelineEnsemble(IHostEnvironment env, IPredictorModel[] predictors, SchemaBindablePipelineEnsembleBase ensemble) + where TOut : CommonOutputs.TrainerOutput, new() + { + var inputSchema = predictors[0].TransformModel.InputSchema; + var dv = new EmptyDataView(env, inputSchema); + + // The role mappings are specific to the individual predictors. + var rmd = RoleMappedData.Create(dv); + var predictorModel = new PredictorModel(env, rmd, dv, ensemble); + + var output = new TOut { PredictorModel = predictorModel }; + return output; + } + + /// + /// This method takes a as input, saves it as an in-memory + /// and returns two arrays indexed by the entries in the zip: + /// 1. An array of byte arrays, containing the byte sequences of each entry. 
+ /// 2. An array of strings, containing the name of each entry. + /// + /// This method is used for comparing pipelines. Its outputs can be passed to + /// to check if this pipeline is identical to another pipeline. + /// + public static void SerializeRoleMappedData(IHostEnvironment env, IChannel ch, RoleMappedData data, + out byte[][] dataSerialized, out string[] dataZipEntryNames) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(ch, nameof(ch)); + ch.CheckValue(data, nameof(data)); + + using (var ms = new MemoryStream()) + { + TrainUtils.SaveModel(env, ch, ms, null, data); + var zip = new ZipArchive(ms); + var entries = zip.Entries.OrderBy(e => e.FullName).ToArray(); + dataSerialized = new byte[Utils.Size(entries)][]; + dataZipEntryNames = new string[Utils.Size(entries)]; + for (int i = 0; i < Utils.Size(entries); i++) + { + dataZipEntryNames[i] = entries[i].FullName; + dataSerialized[i] = new byte[entries[i].Length]; + using (var s = entries[i].Open()) + s.Read(dataSerialized[i], 0, (int)entries[i].Length); + } + } + } + + /// + /// This method compares two pipelines to make sure they are identical. The first pipeline is passed + /// as a , and the second as a double byte array and a string array. The double + /// byte array and the string array are obtained by calling on the + /// second pipeline. + /// The comparison is done by saving as an in-memory , + /// and for each entry in it, comparing its name, and the byte sequence to the corresponding entries in + /// and . + /// This method throws if for any of the entries the name/byte sequence are not identical. 
+ /// + public static void CheckSamePipeline(IHostEnvironment env, IChannel ch, + RoleMappedData dataToCompare, byte[][] dataSerialized, string[] dataZipEntryNames) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(ch, nameof(ch)); + ch.CheckValue(dataToCompare, nameof(dataToCompare)); + ch.CheckValue(dataSerialized, nameof(dataSerialized)); + ch.CheckValue(dataZipEntryNames, nameof(dataZipEntryNames)); + if (dataZipEntryNames.Length != dataSerialized.Length) + { + throw ch.ExceptParam(nameof(dataSerialized), + $"The length of {nameof(dataSerialized)} must be equal to the length of {nameof(dataZipEntryNames)}"); + } + + using (var ms = new MemoryStream()) + { + // REVIEW yaeld (tfinley): This can be done more efficiently by adding a custom type of repository that + // doesn't actually save the data, but upon stream closure compares the results to the given repository + // and then discards it. Currently, however, this cannot be done because ModelSaveContext does not use + // an abstract class/interface, but rather the RepositoryWriter class. 
+ TrainUtils.SaveModel(env, ch, ms, null, dataToCompare); + + string errorMsg = "Models contain different pipelines, cannot ensemble them."; + var zip = new ZipArchive(ms); + var entries = zip.Entries.OrderBy(e => e.FullName).ToArray(); + ch.Check(dataSerialized.Length == Utils.Size(entries)); + byte[] buffer = null; + for (int i = 0; i < dataSerialized.Length; i++) + { + ch.Check(dataZipEntryNames[i] == entries[i].FullName, errorMsg); + int len = dataSerialized[i].Length; + if (Utils.Size(buffer) < len) + buffer = new byte[len]; + using (var s = entries[i].Open()) + { + int bytesRead = s.Read(buffer, 0, len); + ch.Check(bytesRead == len, errorMsg); + for (int j = 0; j < len; j++) + ch.Check(buffer[j] == dataSerialized[i][j], errorMsg); + if (s.Read(buffer, 0, 1) > 0) + throw env.Except(errorMsg); + } + } + } + } + } +} diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/PipelineEnsemble.cs b/src/Microsoft.ML.Ensemble/EntryPoints/PipelineEnsemble.cs new file mode 100644 index 0000000000..7b629bb498 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/EntryPoints/PipelineEnsemble.cs @@ -0,0 +1,56 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble.EntryPoints; +using Microsoft.ML.Runtime.EntryPoints; +using Microsoft.ML.Runtime.Internal.Calibration; + +[assembly: EntryPointModule(typeof(PipelineEnsemble))] + +namespace Microsoft.ML.Runtime.Ensemble.EntryPoints +{ + public static class PipelineEnsemble + { + public sealed class SummaryOutput + { + [TlcModule.Output(Desc = "The summaries of the individual predictors")] + public IDataView[] Summaries; + + [TlcModule.Output(Desc = "The model statistics of the individual predictors")] + public IDataView[] Stats; + } + + [TlcModule.EntryPoint(Name = "Models.EnsembleSummary", Desc = "Summarize a pipeline ensemble predictor.")] + public static SummaryOutput Summarize(IHostEnvironment env, SummarizePredictor.Input input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("PipelineEnsemblePredictor"); + host.CheckValue(input, nameof(input)); + EntryPointUtils.CheckInputArgs(host, input); + + input.PredictorModel.PrepareData(host, new EmptyDataView(host, input.PredictorModel.TransformModel.InputSchema), out RoleMappedData rmd, out IPredictor predictor); + + var calibrated = predictor as CalibratedPredictorBase; + while (calibrated != null) + { + predictor = calibrated.SubPredictor; + calibrated = predictor as CalibratedPredictorBase; + } + var ensemble = predictor as SchemaBindablePipelineEnsembleBase; + host.CheckUserArg(ensemble != null, nameof(input.PredictorModel.Predictor), "Predictor is not a pipeline ensemble predictor"); + + var summaries = new IDataView[ensemble.PredictorModels.Length]; + var stats = new IDataView[ensemble.PredictorModels.Length]; + for (int i = 0; i < ensemble.PredictorModels.Length; i++) + { + var pm = ensemble.PredictorModels[i]; + + pm.PrepareData(host, new EmptyDataView(host, pm.TransformModel.InputSchema), out rmd, out IPredictor pred); + summaries[i] = SummarizePredictor.GetSummaryAndStats(host, pred, rmd.Schema, out stats[i]); + } + return new 
SummaryOutput() { Summaries = summaries, Stats = stats }; + } + } +} diff --git a/src/Microsoft.ML.Ensemble/FeatureSubsetModel.cs b/src/Microsoft.ML.Ensemble/FeatureSubsetModel.cs new file mode 100644 index 0000000000..6eb373b6e6 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/FeatureSubsetModel.cs @@ -0,0 +1,33 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections; +using System.Collections.Generic; +using Microsoft.ML.Runtime.Internal.Utilities; + +namespace Microsoft.ML.Runtime.Ensemble +{ + public sealed class FeatureSubsetModel + where TPredictor : IPredictor + { + public readonly TPredictor Predictor; + public readonly BitArray SelectedFeatures; + public readonly int Cardinality; + + public KeyValuePair[] Metrics { get; set; } + + public FeatureSubsetModel(TPredictor predictor, BitArray features = null, + KeyValuePair[] metrics = null) + { + Predictor = predictor; + int card; + if (features != null && (card = Utils.GetCardinality(features)) < features.Count) + { + SelectedFeatures = features; + Cardinality = card; + } + Metrics = metrics; + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Microsoft.ML.Ensemble.csproj b/src/Microsoft.ML.Ensemble/Microsoft.ML.Ensemble.csproj new file mode 100644 index 0000000000..ddd4557788 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Microsoft.ML.Ensemble.csproj @@ -0,0 +1,15 @@ + + + + netstandard2.0 + Microsoft.ML.Ensemble + CORECLR + + + + + + + + + diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/Average.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/Average.cs new file mode 100644 index 0000000000..0992029cad --- /dev/null +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/Average.cs @@ -0,0 +1,62 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.Model; + +[assembly: LoadableClass(typeof(Average), null, typeof(SignatureCombiner), Average.LoadName)] +[assembly: LoadableClass(typeof(Average), null, typeof(SignatureLoadModel), Average.LoadName, Average.LoaderSignature)] + +namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners +{ + public sealed class Average : BaseAverager, ICanSaveModel, IRegressionOutputCombiner + { + public const string LoadName = "Average"; + public const string LoaderSignature = "AverageCombiner"; + + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "AVG COMB", + verWrittenCur: 0x00010001, + verReadableCur: 0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoaderSignature); + } + + public Average(IHostEnvironment env) + : base(env, LoaderSignature) + { + } + + private Average(IHostEnvironment env, ModelLoadContext ctx) + : base(env, LoaderSignature, ctx) + { + } + + public static Average Create(IHostEnvironment env, ModelLoadContext ctx) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(GetVersionInfo()); + return new Average(env, ctx); + } + + protected override void SaveCore(ModelSaveContext ctx) + { + base.SaveCore(ctx); + ctx.SetVersionInfo(GetVersionInfo()); + } + + public override Combiner GetCombiner() + { + // Force the weights to null. + return + (ref Single dst, Single[] src, Single[] weights) => + CombineCore(ref dst, src, null); + } + } +} diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseAverager.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseAverager.cs new file mode 100644 index 0000000000..824300e594 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseAverager.cs @@ -0,0 +1,78 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.ML.Runtime.Model; + +namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners +{ + public abstract class BaseAverager : IBinaryOutputCombiner + { + protected readonly IHost Host; + public BaseAverager(IHostEnvironment env, string name) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckNonWhiteSpace(name, nameof(name)); + Host = env.Register(name); + } + + protected BaseAverager(IHostEnvironment env, string name, ModelLoadContext ctx) + { + Contracts.AssertValue(env); + env.AssertNonWhiteSpace(name); + Host = env.Register(name); + Host.CheckValue(ctx, nameof(ctx)); + + // *** Binary format *** + // int: sizeof(Single) + int cbFloat = ctx.Reader.ReadInt32(); + Host.CheckDecode(cbFloat == sizeof(Single)); + } + + public void Save(ModelSaveContext ctx) + { + Host.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(); + SaveCore(ctx); + } + + protected virtual void SaveCore(ModelSaveContext ctx) + { + // *** Binary format *** + // int: sizeof(Single) + ctx.Writer.Write(sizeof(Single)); + } + + public abstract Combiner GetCombiner(); + + protected void CombineCore(ref Single dst, Single[] src, Single[] weights = null) + { + Single sum = 0; + Single weightTotal = 0; + if (weights == null) + { + for (int i = 0; i < src.Length; i++) + { + if (!Single.IsNaN(src[i])) + { + sum += src[i]; + weightTotal++; + } + } + } + else + { + for (int i = 0; i < src.Length; i++) + { + if (!Single.IsNaN(src[i])) + { + sum += weights[i] * src[i]; + weightTotal += weights[i]; + } + } + } + dst = sum / weightTotal; + } + } +} diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs new file mode 100644 index 0000000000..d8e37ebcb4 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs @@ -0,0 +1,68 @@ +// 
Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Internal.Utilities; +using Microsoft.ML.Runtime.Model; +using Microsoft.ML.Runtime.Numeric; + +namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners +{ + public abstract class BaseMultiAverager : BaseMultiCombiner + { + internal BaseMultiAverager(IHostEnvironment env, string name, Arguments args) + : base(env, name, args) + { + } + + internal BaseMultiAverager(IHostEnvironment env, string name, ModelLoadContext ctx) + : base(env, name, ctx) + { + } + + protected void CombineCore(ref VBuffer dst, VBuffer[] src, Single[] weights = null) + { + Host.AssertNonEmpty(src); + Host.Assert(weights == null || Utils.Size(weights) == Utils.Size(src)); + + // REVIEW shonk: Should this be tolerant of NaNs? + int len = GetClassCount(src); + if (!TryNormalize(src)) + { + GetNaNOutput(ref dst, len); + return; + } + + var values = dst.Values; + if (Utils.Size(values) < len) + values = new Single[len]; + else + Array.Clear(values, 0, len); + + // Set the output to values. 
+ dst = new VBuffer(len, values, dst.Indices); + + Single weightTotal; + if (weights == null) + { + weightTotal = src.Length; + for (int i = 0; i < src.Length; i++) + VectorUtils.Add(ref src[i], ref dst); + } + else + { + weightTotal = 0; + for (int i = 0; i < src.Length; i++) + { + var w = weights[i]; + weightTotal += w; + VectorUtils.AddMult(ref src[i], w, ref dst); + } + } + + VectorUtils.ScaleBy(ref dst, 1 / weightTotal); + } + } +} diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs new file mode 100644 index 0000000000..9a345b3ebe --- /dev/null +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs @@ -0,0 +1,109 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Internal.Utilities; +using Microsoft.ML.Runtime.Model; +using Microsoft.ML.Runtime.Numeric; + +namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners +{ + public abstract class BaseMultiCombiner : IOutputCombiner> + { + protected readonly IHost Host; + + public class Arguments + { + [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to normalize the output of base models before combining them", + ShortName = "norm", SortOrder = 50)] + public bool Normalize = true; + } + + protected readonly bool Normalize; + + internal BaseMultiCombiner(IHostEnvironment env, string name, Arguments args) + { + Contracts.AssertValue(env); + env.AssertNonWhiteSpace(name); + Host = env.Register(name); + Host.CheckValue(args, nameof(args)); + + Normalize = args.Normalize; + } + + internal BaseMultiCombiner(IHostEnvironment env, string name, ModelLoadContext ctx) + { + Contracts.AssertValue(env); + env.AssertNonWhiteSpace(name); + Host = 
env.Register(name); + Host.AssertValue(ctx); + + // *** Binary format *** + // int: sizeof(Single) + // bool: _normalize + int cbFloat = ctx.Reader.ReadInt32(); + Host.CheckDecode(cbFloat == sizeof(Single)); + Normalize = ctx.Reader.ReadBoolByte(); + } + + public void Save(ModelSaveContext ctx) + { + Host.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(); + SaveCore(ctx); + } + + protected virtual void SaveCore(ModelSaveContext ctx) + { + // *** Binary format *** + // int: sizeof(Single) + // bool: _normalize + ctx.Writer.Write(sizeof(Single)); + ctx.Writer.WriteBoolByte(Normalize); + } + + public abstract Combiner> GetCombiner(); + + protected int GetClassCount(VBuffer[] values) + { + int len = 0; + foreach (var item in values) + { + if (len < item.Length) + len = item.Length; + } + return len; + } + + protected bool TryNormalize(VBuffer[] values) + { + if (!Normalize) + return true; + + for (int i = 0; i < values.Length; i++) + { + // Leave a zero vector as all zeros. Otherwise, make the L1 norm equal to 1. + var sum = VectorUtils.L1Norm(ref values[i]); + if (!FloatUtils.IsFinite(sum)) + return false; + if (sum > 0) + VectorUtils.ScaleBy(ref values[i], 1 / sum); + } + return true; + } + + protected void GetNaNOutput(ref VBuffer dst, int len) + { + Contracts.Assert(len >= 0); + var values = dst.Values; + if (Utils.Size(values) < len) + values = new Single[len]; + for (int i = 0; i < len; i++) + values[i] = Single.NaN; + dst = new VBuffer(len, values, dst.Indices); + } + } +} diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseScalarStacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseScalarStacking.cs new file mode 100644 index 0000000000..a5c9c757a4 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseScalarStacking.cs @@ -0,0 +1,35 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Internal.Utilities; +using Microsoft.ML.Runtime.Model; + +namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners +{ + public abstract class BaseScalarStacking : BaseStacking + { + internal BaseScalarStacking(IHostEnvironment env, string name, ArgumentsBase args) + : base(env, name, args) + { + } + + internal BaseScalarStacking(IHostEnvironment env, string name, ModelLoadContext ctx) + : base(env, name, ctx) + { + } + + protected override void FillFeatureBuffer(Single[] src, ref VBuffer dst) + { + Contracts.AssertNonEmpty(src); + int len = src.Length; + var values = dst.Values; + if (Utils.Size(values) < len) + values = new Single[len]; + Array.Copy(src, values, len); + dst = new VBuffer(len, values, dst.Indices); + } + } +} diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseStacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseStacking.cs new file mode 100644 index 0000000000..6b9badd1c5 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseStacking.cs @@ -0,0 +1,199 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.Threading.Tasks; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Internal.Internallearn; +using Microsoft.ML.Runtime.Internal.Utilities; +using Microsoft.ML.Runtime.Model; +using Microsoft.ML.Runtime.Training; + +namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners +{ + using ColumnRole = RoleMappedSchema.ColumnRole; + public abstract class BaseStacking : IStackingTrainer + { + public abstract class ArgumentsBase + { + [Argument(ArgumentType.AtMostOnce, HelpText = "The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set", ShortName = "vp", SortOrder = 50)] + [TGUI(Label = "Validation Dataset Proportion")] + public Single ValidationDatasetProportion = 0.3f; + + [Argument(ArgumentType.Multiple, HelpText = "Base predictor for meta learning", ShortName = "bp", SortOrder = 50)] + [TGUI(Label = "Base predictor")] + public SubComponent>, TSigBase> BasePredictorType; + } + + protected readonly SubComponent>, TSigBase> BasePredictorType; + protected readonly IHost Host; + protected IPredictorProducing Meta; + + public Single ValidationDatasetProportion { get; } + + internal BaseStacking(IHostEnvironment env, string name, ArgumentsBase args) + { + Contracts.AssertValue(env); + env.AssertNonWhiteSpace(name); + Host = env.Register(name); + Host.AssertValue(args, "args"); + Host.CheckUserArg(0 <= args.ValidationDatasetProportion && args.ValidationDatasetProportion < 1, + nameof(args.ValidationDatasetProportion), + "The validation proportion for stacking should be greater than or equal to 0 and less than 1"); + Host.CheckUserArg(args.BasePredictorType.IsGood(), nameof(args.BasePredictorType)); + + ValidationDatasetProportion = args.ValidationDatasetProportion; + BasePredictorType = args.BasePredictorType; + } + + internal BaseStacking(IHostEnvironment env, string name, ModelLoadContext ctx) + { + 
Contracts.AssertValue(env); + env.AssertNonWhiteSpace(name); + Host = env.Register(name); + Host.AssertValue(ctx); + + // *** Binary format *** + // int: sizeof(Single) + // Float: _validationDatasetProportion + int cbFloat = ctx.Reader.ReadInt32(); + env.CheckDecode(cbFloat == sizeof(Single)); + ValidationDatasetProportion = ctx.Reader.ReadFloat(); + env.CheckDecode(0 <= ValidationDatasetProportion && ValidationDatasetProportion < 1); + + ctx.LoadModel, SignatureLoadModel>(env, out Meta, "MetaPredictor"); + CheckMeta(); + } + + public void Save(ModelSaveContext ctx) + { + Host.Check(Meta != null, "Can't save an untrained Stacking combiner"); + Host.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(); + SaveCore(ctx); + } + + protected virtual void SaveCore(ModelSaveContext ctx) + { + Host.Assert(Meta != null); + + // *** Binary format *** + // int: sizeof(Single) + // Float: _validationDatasetProportion + ctx.Writer.Write(sizeof(Single)); + ctx.Writer.Write(ValidationDatasetProportion); + + ctx.SaveModel(Meta, "MetaPredictor"); + } + + public Combiner GetCombiner() + { + Contracts.Check(Meta != null, "Training of stacking combiner not complete"); + + // Subtle point: We shouldn't get the ValueMapper delegate and cache it in a field + // since generally ValueMappers cannot be assumed to be thread safe - they often + // capture buffers needed for efficient operation. 
+ var mapper = (IValueMapper)Meta; + var map = mapper.GetMapper, TOutput>(); + + var feat = default(VBuffer); + Combiner res = + (ref TOutput dst, TOutput[] src, Single[] weights) => + { + FillFeatureBuffer(src, ref feat); + map(ref feat, ref dst); + }; + return res; + } + + protected abstract void FillFeatureBuffer(TOutput[] src, ref VBuffer dst); + + private void CheckMeta() + { + Contracts.Assert(Meta != null); + + var ivm = Meta as IValueMapper; + Contracts.Check(ivm != null, "Stacking predictor doesn't implement the expected interface"); + if (!ivm.InputType.IsVector || ivm.InputType.ItemType != NumberType.Float) + throw Contracts.Except("Stacking predictor input type is unsupported: {0}", ivm.InputType); + if (ivm.OutputType.RawType != typeof(TOutput)) + throw Contracts.Except("Stacking predictor output type is unsupported: {0}", ivm.OutputType); + } + + public void Train(List>> models, RoleMappedData data, IHostEnvironment env) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register(Stacking.LoadName); + host.CheckValue(models, nameof(models)); + host.CheckValue(data, nameof(data)); + + using (var ch = host.Start("Training stacked model")) + { + ch.Check(Meta == null, "Train called multiple times"); + ch.Check(BasePredictorType != null); + + var maps = new ValueMapper, TOutput>[models.Count]; + for (int i = 0; i < maps.Length; i++) + { + Contracts.Assert(models[i].Predictor is IValueMapper); + var m = (IValueMapper)models[i].Predictor; + maps[i] = m.GetMapper, TOutput>(); + } + + // REVIEW shonk: Should implement this better.... + var labels = new Single[100]; + var features = new VBuffer[100]; + int count = 0; + // REVIEW shonk: Should this include bad values or filter them? 
+ using (var cursor = new FloatLabelCursor(data, CursOpt.AllFeatures | CursOpt.AllLabels)) + { + TOutput[] predictions = new TOutput[maps.Length]; + var vBuffers = new VBuffer[maps.Length]; + while (cursor.MoveNext()) + { + Parallel.For(0, maps.Length, i => + { + var model = models[i]; + if (model.SelectedFeatures != null) + { + EnsembleUtils.SelectFeatures(ref cursor.Features, model.SelectedFeatures, model.Cardinality, ref vBuffers[i]); + maps[i](ref vBuffers[i], ref predictions[i]); + } + else + maps[i](ref cursor.Features, ref predictions[i]); + }); + + Utils.EnsureSize(ref labels, count + 1); + Utils.EnsureSize(ref features, count + 1); + labels[count] = cursor.Label; + FillFeatureBuffer(predictions, ref features[count]); + count++; + } + } + + ch.Info("The number of instances used for stacking trainer is {0}", count); + + var bldr = new ArrayDataViewBuilder(host); + Array.Resize(ref labels, count); + Array.Resize(ref features, count); + bldr.AddColumn("Label", NumberType.Float, labels); + bldr.AddColumn("Features", NumberType.Float, features); + + var view = bldr.GetDataView(); + var rmd = RoleMappedData.Create(view, ColumnRole.Label.Bind("Label"), ColumnRole.Feature.Bind("Features")); + + var trainer = BasePredictorType.CreateInstance(host); + if (trainer is ITrainerEx ex && ex.NeedNormalization) + ch.Warning("The trainer specified for stacking wants normalization, but we do not currently allow this."); + trainer.Train(rmd); + Meta = trainer.CreatePredictor(); + CheckMeta(); + + ch.Done(); + } + } + } +} diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/IOutputCombiner.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/IOutputCombiner.cs new file mode 100644 index 0000000000..1995114d20 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/IOutputCombiner.cs @@ -0,0 +1,54 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.EntryPoints; + +namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners +{ + /// + /// Signature for combiners. + /// + public delegate void SignatureCombiner(); + + public delegate void Combiner(ref TOutput dst, TOutput[] src, Single[] weights); + + public interface IOutputCombiner + { + } + + /// + /// Generic interface for combining outputs of multiple models + /// + public interface IOutputCombiner : IOutputCombiner + { + Combiner GetCombiner(); + } + + public interface IStackingTrainer + { + void Train(List>> models, RoleMappedData data, IHostEnvironment env); + Single ValidationDatasetProportion { get; } + } + + public interface IRegressionOutputCombiner : IOutputCombiner + { + } + + public interface IBinaryOutputCombiner : IOutputCombiner + { + } + + public interface ISupportOutputCombinerFactory : IComponentFactory> + { + } + + public interface IWeightedAverager + { + string WeightageMetricName { get; } + } + +} \ No newline at end of file diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/Median.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/Median.cs new file mode 100644 index 0000000000..e29df82e52 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/Median.cs @@ -0,0 +1,87 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.Internal.Utilities; +using Microsoft.ML.Runtime.Model; + +[assembly: LoadableClass(typeof(Median), null, typeof(SignatureCombiner), Median.UserName, Median.LoadName)] +[assembly: LoadableClass(typeof(Median), null, typeof(SignatureLoadModel), Median.UserName, Median.LoaderSignature)] + +namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners +{ + /// + /// Generic interface for combining outputs of multiple models + /// + public sealed class Median : IRegressionOutputCombiner, IBinaryOutputCombiner, ICanSaveModel + { + private readonly IHost _host; + public const string UserName = "Median"; + public const string LoadName = "Median"; + public const string LoaderSignature = "MedianCombiner"; + + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "MEDICOMB", + verWrittenCur: 0x00010001, + verReadableCur: 0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoaderSignature); + } + + public Median(IHostEnvironment env) + { + Contracts.CheckValue(env, nameof(env)); + _host = env.Register(LoaderSignature); + } + + private Median(IHostEnvironment env, ModelLoadContext ctx) + { + Contracts.AssertValue(env); + _host = env.Register(LoaderSignature); + + // *** Binary format *** + // int: sizeof(Single) + int cbFloat = ctx.Reader.ReadInt32(); + _host.CheckDecode(cbFloat == sizeof(Single)); + } + + public static Median Create(IHostEnvironment env, ModelLoadContext ctx) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(GetVersionInfo()); + return new Median(env, ctx); + } + + public void Save(ModelSaveContext ctx) + { + _host.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(); + ctx.SetVersionInfo(GetVersionInfo()); + + // *** Binary format *** + // int: sizeof(Float) + ctx.Writer.Write(sizeof(Single)); + } + + public Combiner GetCombiner() + { + return 
CombineCore; + } + + private void CombineCore(ref Single dst, Single[] src, Single[] weights) + { + // REVIEW shonk: This mutates "src". We need to ensure that the documentation of + // combiners makes it clear that combiners are allowed to do this. Note that "normalization" + // in the multi-class case also mutates. + _host.AssertNonEmpty(src); + _host.Assert(weights == null || Utils.Size(weights) == Utils.Size(src)); + dst = MathUtils.GetMedianInPlace(src, src.Length); + } + } +} diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs new file mode 100644 index 0000000000..73c10e83fd --- /dev/null +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs @@ -0,0 +1,65 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.Model; + +[assembly: LoadableClass(typeof(MultiAverage), typeof(MultiAverage.Arguments), typeof(SignatureCombiner), + Average.LoadName, MultiAverage.LoadName)] +[assembly: LoadableClass(typeof(MultiAverage), null, typeof(SignatureLoadModel), + Average.LoadName, MultiAverage.LoadName, MultiAverage.LoaderSignature)] + +namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners +{ + public sealed class MultiAverage : BaseMultiAverager, ICanSaveModel + { + public const string LoadName = "MultiAverage"; + public const string LoaderSignature = "MultiAverageCombiner"; + + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "MAVGCOMB", + verWrittenCur: 0x00010001, + verReadableCur: 0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoaderSignature); + } + + public MultiAverage(IHostEnvironment env, Arguments args) + 
: base(env, LoaderSignature, args) + { + } + + private MultiAverage(IHostEnvironment env, ModelLoadContext ctx) + : base(env, LoaderSignature, ctx) + { + } + + public static MultiAverage Create(IHostEnvironment env, ModelLoadContext ctx) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(GetVersionInfo()); + return new MultiAverage(env, ctx); + } + + protected override void SaveCore(ModelSaveContext ctx) + { + base.SaveCore(ctx); + ctx.SetVersionInfo(GetVersionInfo()); + } + + public override Combiner> GetCombiner() + { + // Force the weights to null. + return + (ref VBuffer dst, VBuffer[] src, Single[] weights) => + CombineCore(ref dst, src, null); + } + } +} diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs new file mode 100644 index 0000000000..0e94315d21 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs @@ -0,0 +1,95 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Ensemble.OutputCombiners;
using Microsoft.ML.Runtime.Internal.Utilities;
using Microsoft.ML.Runtime.Model;

[assembly: LoadableClass(typeof(MultiMedian), typeof(MultiMedian.Arguments), typeof(SignatureCombiner),
    Median.UserName, MultiMedian.LoadName)]
[assembly: LoadableClass(typeof(MultiMedian), null, typeof(SignatureLoadModel), Median.UserName, MultiMedian.LoaderSignature)]

namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners
{
    /// <summary>
    /// Multi-class output combiner: for each class, the combined score is the median of the
    /// scores the individual models produced for that class. Normalization of the source
    /// vectors is handled by the <see cref="BaseMultiCombiner"/> base class.
    /// </summary>
    public sealed class MultiMedian : BaseMultiCombiner, ICanSaveModel
    {
        public const string LoadName = "MultiMedian";
        public const string LoaderSignature = "MultiMedianCombiner";

        private static VersionInfo GetVersionInfo()
        {
            return new VersionInfo(
                modelSignature: "MMEDCOMB",
                verWrittenCur: 0x00010001,
                verReadableCur: 0x00010001,
                verWeCanReadBack: 0x00010001,
                loaderSignature: LoaderSignature);
        }

        public MultiMedian(IHostEnvironment env, Arguments args)
            : base(env, LoaderSignature, args)
        {
        }

        private MultiMedian(IHostEnvironment env, ModelLoadContext ctx)
            : base(env, LoaderSignature, ctx)
        {
        }

        /// <summary>
        /// Factory used by the model-loading infrastructure (SignatureLoadModel).
        /// </summary>
        public static MultiMedian Create(IHostEnvironment env, ModelLoadContext ctx)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel(GetVersionInfo());
            return new MultiMedian(env, ctx);
        }

        protected override void SaveCore(ModelSaveContext ctx)
        {
            base.SaveCore(ctx);
            ctx.SetVersionInfo(GetVersionInfo());
        }

        /// <summary>
        /// Returns a delegate combining the models' score vectors element-wise by median.
        /// The delegate closes over a single scratch buffer (<c>raw</c>), so one delegate
        /// instance must not be shared across threads.
        /// </summary>
        public override Combiner<VBuffer<Single>> GetCombiner()
        {
            Single[] raw = null;
            return
                (ref VBuffer<Single> dst, VBuffer<Single>[] src, Single[] weights) =>
                {
                    Host.AssertNonEmpty(src);
                    Host.Assert(weights == null || Utils.Size(weights) == Utils.Size(src));

                    int len = GetClassCount(src);
                    // If a source vector cannot be normalized, emit NaNs rather than a
                    // misleading combination.
                    if (!TryNormalize(src))
                    {
                        GetNaNOutput(ref dst, len);
                        return;
                    }

                    // Reuse dst's backing array when it is large enough.
                    var values = dst.Values;
                    if (Utils.Size(values) < len)
                        values = new Single[len];

                    int count = src.Length;
                    if (Utils.Size(raw) < count)
                        raw = new Single[count];
                    for (int i = 0; i < len; i++)
                    {
                        // Gather the i-th class score from every model; models whose score
                        // vector is shorter contribute zero for that class.
                        for (int j = 0; j < count; j++)
                            raw[j] = i < src[j].Length ? src[j].GetItemOrDefault(i) : 0;
                        values[i] = MathUtils.GetMedianInPlace(raw, count);
                    }

                    // Set the output to values.
                    dst = new VBuffer<Single>(len, values, dst.Indices);
                };
        }
    }
}
+ verReadableCur: 0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoaderSignature); + } + + public class Arguments : ArgumentsBase + { + public Arguments() + { + // REVIEW tfinley: Kinda stupid. Perhaps we can have a better non-parametetric learner. + BasePredictorType = new SubComponent, SignatureMultiClassClassifierTrainer>( + "OVA", "p=FastTreeBinaryClassification"); + } + } + + public MultiStacking(IHostEnvironment env, Arguments args) + : base(env, LoaderSignature, args) + { + } + + private MultiStacking(IHostEnvironment env, ModelLoadContext ctx) + : base(env, LoaderSignature, ctx) + { + } + + public static MultiStacking Create(IHostEnvironment env, ModelLoadContext ctx) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(GetVersionInfo()); + return new MultiStacking(env, ctx); + } + + protected override void SaveCore(ModelSaveContext ctx) + { + base.SaveCore(ctx); + ctx.SetVersionInfo(GetVersionInfo()); + } + + protected override void FillFeatureBuffer(VBuffer[] src, ref VBuffer dst) + { + Contracts.AssertNonEmpty(src); + + // REVIEW shonk: Would there be any value in ever making dst sparse? + int len = 0; + for (int i = 0; i < src.Length; i++) + len += src[i].Length; + + var values = dst.Values; + if (Utils.Size(values) < len) + values = new Single[len]; + dst = new VBuffer(len, values, dst.Indices); + + int iv = 0; + for (int i = 0; i < src.Length; i++) + { + src[i].CopyTo(values, iv); + iv += src[i].Length; + Contracts.Assert(iv <= len); + } + Contracts.Assert(iv == len); + } + } +} diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs new file mode 100644 index 0000000000..802a756f5c --- /dev/null +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs @@ -0,0 +1,105 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Ensemble.OutputCombiners;
using Microsoft.ML.Runtime.Internal.Utilities;
using Microsoft.ML.Runtime.Model;
using Microsoft.ML.Runtime.Numeric;

[assembly: LoadableClass(typeof(MultiVoting), null, typeof(SignatureCombiner), Voting.UserName, MultiVoting.LoadName)]
[assembly: LoadableClass(typeof(MultiVoting), null, typeof(SignatureLoadModel), Voting.UserName, MultiVoting.LoaderSignature)]

namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners
{
    // REVIEW shonk: Why is MultiVoting based on BaseMultiCombiner? Normalizing the model outputs
    // is senseless, so the base adds no real functionality.
    /// <summary>
    /// Multi-class output combiner that gives each model one vote for its highest-scoring
    /// class (via <see cref="VectorUtils.ArgMax"/>), then outputs the per-class vote
    /// fractions. Normalization is always disabled.
    /// </summary>
    public sealed class MultiVoting : BaseMultiCombiner, ICanSaveModel
    {
        public const string LoadName = "MultiVoting";
        public const string LoaderSignature = "MultiVotingCombiner";

        private static VersionInfo GetVersionInfo()
        {
            return new VersionInfo(
                modelSignature: "MVOTCOMB",
                verWrittenCur: 0x00010001,
                verReadableCur: 0x00010001,
                verWeCanReadBack: 0x00010001,
                loaderSignature: LoaderSignature);
        }

        public MultiVoting(IHostEnvironment env)
            : base(env, LoaderSignature, new Arguments() { Normalize = false })
        {
            Host.Assert(!Normalize);
        }

        private MultiVoting(IHostEnvironment env, ModelLoadContext ctx)
            : base(env, LoaderSignature, ctx)
        {
            // Voting never normalizes; a serialized model claiming otherwise is corrupt.
            Host.CheckDecode(!Normalize);
        }

        /// <summary>
        /// Factory used by the model-loading infrastructure (SignatureLoadModel).
        /// </summary>
        public static MultiVoting Create(IHostEnvironment env, ModelLoadContext ctx)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel(GetVersionInfo());
            return new MultiVoting(env, ctx);
        }

        protected override void SaveCore(ModelSaveContext ctx)
        {
            Contracts.Assert(!Normalize);
            base.SaveCore(ctx);
            ctx.SetVersionInfo(GetVersionInfo());
        }

        public override Combiner<VBuffer<Single>> GetCombiner()
        {
            return CombineCore;
        }

        private void CombineCore(ref VBuffer<Single> dst, VBuffer<Single>[] src, Single[] weights = null)
        {
            Host.AssertNonEmpty(src);
            // Weights are accepted for interface uniformity but ignored: one model, one vote.
            Host.Assert(weights == null || Utils.Size(weights) == Utils.Size(src));

            int count = Utils.Size(src);
            if (count == 0)
            {
                dst = new VBuffer<Single>(0, dst.Values, dst.Indices);
                return;
            }

            int len = GetClassCount(src);
            var values = dst.Values;
            if (Utils.Size(values) < len)
                values = new Single[len];
            else
                Array.Clear(values, 0, len);

            int voteCount = 0;
            for (int i = 0; i < count; i++)
            {
                // ArgMax returns a negative index for a vector with no valid maximum
                // (e.g. empty); such a model casts no vote.
                int index = VectorUtils.ArgMax(ref src[i]);
                if (index >= 0)
                {
                    values[index]++;
                    voteCount++;
                }
            }

            // Normalize by dividing by the number of votes. Guard against zero votes
            // (all models abstained), which previously divided by zero.
            if (voteCount > 0)
            {
                for (int i = 0; i < len; i++)
                    values[i] /= voteCount;
            }

            // Set the output to values.
            dst = new VBuffer<Single>(len, values, dst.Indices);
        }
    }
}
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.CommandLine;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Ensemble.OutputCombiners;
using Microsoft.ML.Runtime.Internal.Internallearn;
using Microsoft.ML.Runtime.Model;

[assembly: LoadableClass(typeof(MultiWeightedAverage), typeof(MultiWeightedAverage.Arguments), typeof(SignatureCombiner),
    MultiWeightedAverage.UserName, MultiWeightedAverage.LoadName)]

[assembly: LoadableClass(typeof(MultiWeightedAverage), null, typeof(SignatureLoadModel),
    MultiWeightedAverage.UserName, MultiWeightedAverage.LoaderSignature)]

namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners
{
    // These values are serialized, so should not be changed.
    public enum MultiWeightageKind
    {
        [TGUI(Label = MultiClassClassifierEvaluator.AccuracyMicro)]
        AccuracyMicroAvg = 0,
        [TGUI(Label = MultiClassClassifierEvaluator.AccuracyMacro)]
        AccuracyMacroAvg = 1
    }

    /// <summary>
    /// Multi-class output combiner that averages the models' score vectors, weighting each
    /// model by a per-model metric (micro- or macro-averaged accuracy). The averaging itself
    /// is implemented by the <see cref="BaseMultiAverager"/> base class.
    /// </summary>
    public sealed class MultiWeightedAverage : BaseMultiAverager, IWeightedAverager, ICanSaveModel
    {
        public const string UserName = "Multi Weighted Average";
        public const string LoadName = "MultiWeightedAverage";
        public const string LoaderSignature = "MultiWeightedAverageCombiner";

        private static VersionInfo GetVersionInfo()
        {
            return new VersionInfo(
                modelSignature: "MWAVCOMB",
                verWrittenCur: 0x00010001,
                verReadableCur: 0x00010001,
                verWeCanReadBack: 0x00010001,
                loaderSignature: LoaderSignature);
        }

        public new class Arguments : BaseMultiCombiner.Arguments
        {
            [Argument(ArgumentType.AtMostOnce, HelpText = "The metric type to be used to find the weights for each model", ShortName = "wn", SortOrder = 50)]
            [TGUI(Label = "Metric Name", Description = "The weights are calculated according to the selected metric")]
            public MultiWeightageKind WeightageName = MultiWeightageKind.AccuracyMicroAvg;
        }

        private readonly MultiWeightageKind _weightageKind;

        // Name of the metric whose value weights each model's contribution.
        public string WeightageMetricName { get { return _weightageKind.ToString(); } }

        public MultiWeightedAverage(IHostEnvironment env, Arguments args)
            : base(env, LoaderSignature, args)
        {
            _weightageKind = args.WeightageName;
            Host.CheckUserArg(Enum.IsDefined(typeof(MultiWeightageKind), _weightageKind), nameof(args.WeightageName));
        }

        private MultiWeightedAverage(IHostEnvironment env, ModelLoadContext ctx)
            : base(env, LoaderSignature, ctx)
        {
            // *** Binary format ***
            // int: _weightageKind

            _weightageKind = (MultiWeightageKind)ctx.Reader.ReadInt32();
            Host.CheckDecode(Enum.IsDefined(typeof(MultiWeightageKind), _weightageKind));
        }

        /// <summary>
        /// Factory used by the model-loading infrastructure (SignatureLoadModel).
        /// </summary>
        public static MultiWeightedAverage Create(IHostEnvironment env, ModelLoadContext ctx)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel(GetVersionInfo());
            return new MultiWeightedAverage(env, ctx);
        }

        protected override void SaveCore(ModelSaveContext ctx)
        {
            base.SaveCore(ctx);
            ctx.SetVersionInfo(GetVersionInfo());
            // *** Binary format ***
            // int: _weightageKind

            Host.Assert(Enum.IsDefined(typeof(MultiWeightageKind), _weightageKind));
            ctx.Writer.Write((int)_weightageKind);
        }

        public override Combiner<VBuffer<Single>> GetCombiner()
        {
            // CombineCore (weighted average) is supplied by BaseMultiAverager.
            return CombineCore;
        }
    }
}
using System;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.CommandLine;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Ensemble.OutputCombiners;
using Microsoft.ML.Runtime.Model;

[assembly: LoadableClass(typeof(RegressionStacking), typeof(RegressionStacking.Arguments), typeof(SignatureCombiner),
    Stacking.UserName, RegressionStacking.LoaderSignature)]
[assembly: LoadableClass(typeof(RegressionStacking), null, typeof(SignatureLoadModel),
    Stacking.UserName, RegressionStacking.LoaderSignature)]

namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners
{
    using TScalarPredictor = IPredictorProducing<Single>;

    /// <summary>
    /// Stacking combiner for regression: trains a meta-regressor (FastTree regression by
    /// default) on the base models' scalar outputs and combines with its prediction.
    /// Training and scoring logic live in the stacking base class.
    /// </summary>
    public sealed class RegressionStacking : BaseScalarStacking<SignatureRegressorTrainer>, IRegressionOutputCombiner, ICanSaveModel
    {
        public const string LoaderSignature = "RegressionStacking";

        private static VersionInfo GetVersionInfo()
        {
            return new VersionInfo(
                modelSignature: "RSTACK C",
                verWrittenCur: 0x00010001, // Initial
                verReadableCur: 0x00010001,
                verWeCanReadBack: 0x00010001,
                loaderSignature: LoaderSignature);
        }

        public class Arguments : ArgumentsBase
        {
            public Arguments()
            {
                // Default meta-learner for the stacking layer.
                BasePredictorType = new SubComponent<ITrainer<RoleMappedData, TScalarPredictor>, SignatureRegressorTrainer>("FastTreeRegression");
            }
        }

        public RegressionStacking(IHostEnvironment env, Arguments args)
            : base(env, LoaderSignature, args)
        {
        }

        private RegressionStacking(IHostEnvironment env, ModelLoadContext ctx)
            : base(env, LoaderSignature, ctx)
        {
        }

        /// <summary>
        /// Factory used by the model-loading infrastructure (SignatureLoadModel).
        /// </summary>
        public static RegressionStacking Create(IHostEnvironment env, ModelLoadContext ctx)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel(GetVersionInfo());
            return new RegressionStacking(env, ctx);
        }

        protected override void SaveCore(ModelSaveContext ctx)
        {
            base.SaveCore(ctx);
            ctx.SetVersionInfo(GetVersionInfo());
        }
    }
}
100644 index 0000000000..af582e98e9 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/Stacking.cs @@ -0,0 +1,66 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.Model; + +[assembly: LoadableClass(typeof(Stacking), typeof(Stacking.Arguments), typeof(SignatureCombiner),Stacking.UserName, Stacking.LoadName)] +[assembly: LoadableClass(typeof(Stacking), null, typeof(SignatureLoadModel), Stacking.UserName, Stacking.LoaderSignature)] + +namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners +{ + using TScalarPredictor = IPredictorProducing; + public sealed class Stacking : BaseScalarStacking, IBinaryOutputCombiner, ICanSaveModel + { + public const string UserName = "Stacking"; + public const string LoadName = "Stacking"; + public const string LoaderSignature = "StackingCombiner"; + + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: " STACK C", + verWrittenCur: 0x00010001, // Initial + verReadableCur: 0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoaderSignature); + } + + public class Arguments : ArgumentsBase + { + public Arguments() + { + BasePredictorType = new SubComponent, SignatureBinaryClassifierTrainer>("FastTreeBinaryClassification"); + } + } + + public Stacking(IHostEnvironment env, Arguments args) + : base(env, LoaderSignature, args) + { + } + + private Stacking(IHostEnvironment env, ModelLoadContext ctx) + : base(env, LoaderSignature, ctx) + { + } + + public static Stacking Create(IHostEnvironment env, ModelLoadContext ctx) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(ctx, nameof(ctx)); + 
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Ensemble.OutputCombiners;
using Microsoft.ML.Runtime.Internal.Utilities;
using Microsoft.ML.Runtime.Model;

[assembly: LoadableClass(typeof(Voting), null, typeof(SignatureCombiner), Voting.UserName, Voting.UserName)]
[assembly: LoadableClass(typeof(Voting), null, typeof(SignatureLoadModel), Voting.UserName, Voting.LoaderSignature)]

namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners
{
    /// <summary>
    /// Binary-classification output combiner that takes a majority vote over the sign of
    /// each model's score: scores &gt; 0 are positive votes, scores &lt;= 0 negative.
    /// The combined output is (pos - neg) / (pos + neg), a value in [-1, 1].
    /// </summary>
    public sealed class Voting : IBinaryOutputCombiner, ICanSaveModel
    {
        private readonly IHost _host;
        public const string UserName = "Voting";
        public const string LoaderSignature = "VotingCombiner";

        private static VersionInfo GetVersionInfo()
        {
            return new VersionInfo(
                modelSignature: "VOT COMB",
                verWrittenCur: 0x00010001,
                verReadableCur: 0x00010001,
                verWeCanReadBack: 0x00010001,
                loaderSignature: LoaderSignature);
        }

        public Voting(IHostEnvironment env)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register(LoaderSignature);
        }

        private Voting(IHostEnvironment env, ModelLoadContext ctx)
        {
            Contracts.AssertValue(env);
            _host = env.Register(LoaderSignature);
            _host.AssertValue(ctx);

            // *** Binary format ***
            // int: sizeof(Single)
            int cbFloat = ctx.Reader.ReadInt32();
            _host.CheckDecode(cbFloat == sizeof(Single));
        }

        /// <summary>
        /// Factory used by the model-loading infrastructure (SignatureLoadModel).
        /// </summary>
        public static Voting Create(IHostEnvironment env, ModelLoadContext ctx)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel(GetVersionInfo());
            return new Voting(env, ctx);
        }

        public void Save(ModelSaveContext ctx)
        {
            _host.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel();
            ctx.SetVersionInfo(GetVersionInfo());

            // *** Binary format ***
            // int: sizeof(Single)
            ctx.Writer.Write(sizeof(Single));
        }

        public Combiner<Single> GetCombiner()
        {
            return CombineCore;
        }

        private void CombineCore(ref Single dst, Single[] src, Single[] weights)
        {
            _host.AssertNonEmpty(src);
            // Weights are accepted for interface uniformity but ignored: one model, one vote.
            _host.Assert(weights == null || Utils.Size(weights) == Utils.Size(src));

            int pos = 0;
            int neg = 0;
            for (int i = 0; i < src.Length; i++)
            {
                var v = src[i];
                if (v > 0)
                    pos++;
                else if (v <= 0)
                    neg++;
                // NOTE(review): a NaN score satisfies neither comparison, so it simply
                // loses its vote; if every score is NaN the result is 0/0 = NaN. Confirm
                // whether callers guarantee finite scores.
            }
            dst = (Single)(pos - neg) / (pos + neg);
        }
    }
}
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.CommandLine;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Ensemble.OutputCombiners;
using Microsoft.ML.Runtime.Internal.Internallearn;
using Microsoft.ML.Runtime.Model;

[assembly: LoadableClass(typeof(WeightedAverage), typeof(WeightedAverage.Arguments), typeof(SignatureCombiner),
    WeightedAverage.UserName, WeightedAverage.LoadName)]

[assembly: LoadableClass(typeof(WeightedAverage), null, typeof(SignatureLoadModel),
    WeightedAverage.UserName, WeightedAverage.LoaderSignature)]

namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners
{
    /// <summary>
    /// Binary-classification output combiner that averages the models' scalar scores,
    /// weighting each model by a per-model evaluation metric (<see cref="WeightageKind"/>).
    /// The averaging itself is implemented by <see cref="BaseAverager"/>.
    /// </summary>
    public sealed class WeightedAverage : BaseAverager, IWeightedAverager, ICanSaveModel
    {
        public const string UserName = "Weighted Average";
        public const string LoadName = "WeightedAverage";
        public const string LoaderSignature = "WeightedAverageCombiner";

        private static VersionInfo GetVersionInfo()
        {
            return new VersionInfo(
                modelSignature: "WAVGCOMB",
                verWrittenCur: 0x00010001,
                verReadableCur: 0x00010001,
                verWeCanReadBack: 0x00010001,
                loaderSignature: LoaderSignature);
        }

        public class Arguments
        {
            [Argument(ArgumentType.AtMostOnce, HelpText = "The metric type to be used to find the weights for each model", ShortName = "wn", SortOrder = 50)]
            [TGUI(Label = "Weightage Name", Description = "The weights are calculated according to the selected metric")]
            public WeightageKind WeightageName = WeightageKind.Auc;
        }

        // Set only in constructors; readonly for consistency with MultiWeightedAverage.
        private readonly WeightageKind _weightageKind;

        // Name of the metric whose value weights each model's contribution.
        public string WeightageMetricName { get { return _weightageKind.ToString(); } }

        public WeightedAverage(IHostEnvironment env, Arguments args)
            : base(env, LoaderSignature)
        {
            _weightageKind = args.WeightageName;
            Host.CheckUserArg(Enum.IsDefined(typeof(WeightageKind), _weightageKind), nameof(args.WeightageName));
        }

        private WeightedAverage(IHostEnvironment env, ModelLoadContext ctx)
            : base(env, LoaderSignature, ctx)
        {
            // *** Binary format ***
            // int: _weightageKind
            _weightageKind = (WeightageKind)ctx.Reader.ReadInt32();
            Host.CheckDecode(Enum.IsDefined(typeof(WeightageKind), _weightageKind));
        }

        /// <summary>
        /// Factory used by the model-loading infrastructure (SignatureLoadModel).
        /// </summary>
        public static WeightedAverage Create(IHostEnvironment env, ModelLoadContext ctx)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel(GetVersionInfo());
            return new WeightedAverage(env, ctx);
        }

        protected override void SaveCore(ModelSaveContext ctx)
        {
            base.SaveCore(ctx);
            ctx.SetVersionInfo(GetVersionInfo());

            // *** Binary format ***
            // int: _weightageKind

            Host.Assert(Enum.IsDefined(typeof(WeightageKind), _weightageKind));
            ctx.Writer.Write((int)_weightageKind);
        }

        public override Combiner<Single> GetCombiner()
        {
            // CombineCore (weighted average) is supplied by BaseAverager.
            return CombineCore;
        }
    }

    // These values are serialized, so should not be changed.
    public enum WeightageKind
    {
        [TGUI(Label = BinaryClassifierEvaluator.Accuracy)]
        Accuracy = 0,
        [TGUI(Label = BinaryClassifierEvaluator.Auc)]
        Auc = 1,
        [TGUI(Label = BinaryClassifierEvaluator.PosPrecName)]
        PosPrecision = 2,
        [TGUI(Label = BinaryClassifierEvaluator.PosRecallName)]
        PosRecall = 3,
        [TGUI(Label = BinaryClassifierEvaluator.NegPrecName)]
        NegPrecision = 4,
        [TGUI(Label = BinaryClassifierEvaluator.NegRecallName)]
        NegRecall = 5,
    }
}
+ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble; +using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.EntryPoints; +using Microsoft.ML.Runtime.Internal.Calibration; +using Microsoft.ML.Runtime.Internal.Internallearn; +using Microsoft.ML.Runtime.Internal.Utilities; +using Microsoft.ML.Runtime.Model; + +[assembly: LoadableClass(typeof(SchemaBindablePipelineEnsembleBase), null, typeof(SignatureLoadModel), + SchemaBindablePipelineEnsembleBase.UserName, SchemaBindablePipelineEnsembleBase.LoaderSignature)] + +namespace Microsoft.ML.Runtime.Ensemble +{ + /// + /// This class represents an ensemble predictor, where each predictor has its own featurization pipeline. It is + /// useful for the distributed training scenario, where the featurization includes trainable transforms (for example, + /// categorical transform, or normalization). 
+ /// + public abstract class SchemaBindablePipelineEnsembleBase : ICanGetTrainingLabelNames, ICanSaveModel, ISchemaBindableMapper, ICanSaveSummary, ICanGetSummaryInKeyValuePairs + { + private abstract class BoundBase : ISchemaBoundRowMapper + { + protected readonly SchemaBindablePipelineEnsembleBase Parent; + private readonly HashSet _inputColIndices; + + protected readonly ISchemaBoundRowMapper[] Mappers; + protected readonly IRowToRowMapper[] BoundPipelines; + protected readonly int[] ScoreCols; + + public ISchemaBindableMapper Bindable + { + get { return Parent; } + } + + public RoleMappedSchema InputSchema { get; } + + public ISchema OutputSchema { get; } + + public BoundBase(SchemaBindablePipelineEnsembleBase parent, RoleMappedSchema schema) + { + Parent = parent; + InputSchema = schema; + OutputSchema = new ScoreMapperSchema(Parent.ScoreType, Parent._scoreColumnKind); + _inputColIndices = new HashSet(); + for (int i = 0; i < Parent._inputCols.Length; i++) + { + var name = Parent._inputCols[i]; + if (!InputSchema.Schema.TryGetColumnIndex(name, out int col)) + throw Parent.Host.Except("Schema does not contain required input column '{0}'", name); + _inputColIndices.Add(col); + } + + Mappers = new ISchemaBoundRowMapper[Parent.PredictorModels.Length]; + BoundPipelines = new IRowToRowMapper[Parent.PredictorModels.Length]; + ScoreCols = new int[Parent.PredictorModels.Length]; + for (int i = 0; i < Mappers.Length; i++) + { + // Get the RoleMappedSchema to pass to the predictor. + var emptyDv = new EmptyDataView(Parent.Host, schema.Schema); + Parent.PredictorModels[i].PrepareData(Parent.Host, emptyDv, out RoleMappedData rmd, out IPredictor predictor); + + // Get the predictor as a bindable mapper, and bind it to the RoleMappedSchema found above. 
+ var bindable = ScoreUtils.GetSchemaBindableMapper(Parent.Host, Parent.PredictorModels[i].Predictor, null); + Mappers[i] = bindable.Bind(Parent.Host, rmd.Schema) as ISchemaBoundRowMapper; + if (Mappers[i] == null) + throw Parent.Host.Except("Predictor {0} is not a row to row mapper", i); + + // Make sure there is a score column, and remember its index. + if (!Mappers[i].OutputSchema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out ScoreCols[i])) + throw Parent.Host.Except("Predictor {0} does not contain a score column", i); + + // Get the pipeline. + var dv = new EmptyDataView(Parent.Host, schema.Schema); + var tm = new TransformModel(Parent.Host, dv, dv); + var pipeline = Parent.PredictorModels[i].TransformModel.Apply(Parent.Host, tm); + BoundPipelines[i] = pipeline.AsRowToRowMapper(Parent.Host); + if (BoundPipelines[i] == null) + throw Parent.Host.Except("Transform pipeline {0} contains transforms that do not implement IRowToRowMapper", i); + } + } + + public Func GetDependencies(Func predicate) + { + for (int i = 0; i < OutputSchema.ColumnCount; i++) + { + if (predicate(i)) + return col => _inputColIndices.Contains(col); + } + return col => false; + } + + public IEnumerable> GetInputColumnRoles() + { + yield break; + } + + public IRow GetOutputRow(IRow input, Func predicate, out Action disposer) + { + return new SimpleRow(OutputSchema, input, new[] { CreateScoreGetter(input, predicate, out disposer) }); + } + + public abstract Delegate CreateScoreGetter(IRow input, Func mapperPredicate, out Action disposer); + } + + // A generic base class for pipeline ensembles. This class contains the combiner. 
+ private abstract class SchemaBindablePipelineEnsemble : SchemaBindablePipelineEnsembleBase, IPredictorProducing + { + protected sealed class Bound : BoundBase + { + private readonly IOutputCombiner _combiner; + + public Bound(SchemaBindablePipelineEnsemble parent, RoleMappedSchema schema) + : base(parent, schema) + { + _combiner = parent.Combiner; + } + + public override Delegate CreateScoreGetter(IRow input, Func mapperPredicate, out Action disposer) + { + disposer = null; + + if (!mapperPredicate(0)) + return null; + + var getters = new ValueGetter[Mappers.Length]; + for (int i = 0; i < Mappers.Length; i++) + { + // First get the output row from the pipelines. The input predicate of the predictor + // is the output predicate of the pipeline. + var inputPredicate = Mappers[i].GetDependencies(mapperPredicate); + var pipelineRow = BoundPipelines[i].GetRow(input, inputPredicate, out Action disp); + disposer += disp; + + // Next we get the output row from the predictors. We activate the score column as output predicate. 
+ var predictorRow = Mappers[i].GetOutputRow(pipelineRow, col => col == ScoreCols[i], out disp); + disposer += disp; + getters[i] = predictorRow.GetGetter(ScoreCols[i]); + } + + var comb = _combiner.GetCombiner(); + var buffer = new T[Mappers.Length]; + ValueGetter scoreGetter = + (ref T dst) => + { + for (int i = 0; i < Mappers.Length; i++) + getters[i](ref buffer[i]); + comb(ref dst, buffer, null); + }; + return scoreGetter; + } + + public ValueGetter GetLabelGetter(IRow input, int i, out Action disposer) + { + Parent.Host.Assert(0 <= i && i < Mappers.Length); + Parent.Host.Check(Mappers[i].InputSchema.Label != null, "Mapper was not trained using a label column"); + + // The label should be in the output row of the i'th pipeline + var pipelineRow = BoundPipelines[i].GetRow(input, col => col == Mappers[i].InputSchema.Label.Index, out disposer); + return RowCursorUtils.GetLabelGetter(pipelineRow, Mappers[i].InputSchema.Label.Index); + } + + public ValueGetter GetWeightGetter(IRow input, int i, out Action disposer) + { + Parent.Host.Assert(0 <= i && i < Mappers.Length); + + if (Mappers[i].InputSchema.Weight == null) + { + ValueGetter weight = (ref Single dst) => dst = 1; + disposer = null; + return weight; + } + // The weight should be in the output row of the i'th pipeline if it exists. 
+ var inputPredicate = Mappers[i].GetDependencies(col => col == Mappers[i].InputSchema.Weight.Index); + var pipelineRow = BoundPipelines[i].GetRow(input, inputPredicate, out disposer); + return pipelineRow.GetGetter(Mappers[i].InputSchema.Weight.Index); + } + } + + protected readonly IOutputCombiner Combiner; + + protected SchemaBindablePipelineEnsemble(IHostEnvironment env, IPredictorModel[] predictors, + IOutputCombiner combiner, string registrationName, string scoreColumnKind) + : base(env, predictors, registrationName, scoreColumnKind) + { + Combiner = combiner; + } + + protected SchemaBindablePipelineEnsemble(IHostEnvironment env, ModelLoadContext ctx, string scoreColumnKind) + : base(env, ctx, scoreColumnKind) + { + // *** Binary format *** + // + // The combiner + + ctx.LoadModel, SignatureLoadModel>(Host, out Combiner, "Combiner"); + } + + protected override void SaveCore(ModelSaveContext ctx) + { + Host.AssertValue(ctx); + + // *** Binary format *** + // + // The combiner + + ctx.SaveModel(Combiner, "Combiner"); + } + + public override ISchemaBoundMapper Bind(IHostEnvironment env, RoleMappedSchema schema) + { + return new Bound(this, schema); + } + } + + // This is an implementation of pipeline ensembles that combines scores of type float (regression and anomaly detection). 
+ private sealed class ImplOne : SchemaBindablePipelineEnsemble + { + protected override ColumnType ScoreType { get { return NumberType.R4; } } + + public override PredictionKind PredictionKind + { + get + { + if (_scoreColumnKind == MetadataUtils.Const.ScoreColumnKind.Regression) + return PredictionKind.Regression; + if (_scoreColumnKind == MetadataUtils.Const.ScoreColumnKind.AnomalyDetection) + return PredictionKind.AnomalyDetection; + throw Host.Except("Unknown prediction kind"); + } + } + + public ImplOne(IHostEnvironment env, IPredictorModel[] predictors, IOutputCombiner combiner, string scoreColumnKind) + : base(env, predictors, combiner, LoaderSignature, scoreColumnKind) + { + } + + public ImplOne(IHostEnvironment env, ModelLoadContext ctx, string scoreColumnKind) + : base(env, ctx, scoreColumnKind) + { + } + } + + // This is an implementation of pipeline ensemble that combines scores of type vectors of float (multi-class). + private sealed class ImplVec : SchemaBindablePipelineEnsemble> + { + protected override ColumnType ScoreType { get { return _scoreType; } } + + public override PredictionKind PredictionKind + { + get + { + if (_scoreColumnKind == MetadataUtils.Const.ScoreColumnKind.MultiClassClassification) + return PredictionKind.MultiClassClassification; + throw Host.Except("Unknown prediction kind"); + } + } + + private readonly VectorType _scoreType; + + public ImplVec(IHostEnvironment env, IPredictorModel[] predictors, IOutputCombiner> combiner) + : base(env, predictors, combiner, LoaderSignature, MetadataUtils.Const.ScoreColumnKind.MultiClassClassification) + { + int classCount = CheckLabelColumn(Host, predictors, false); + _scoreType = new VectorType(NumberType.R4, classCount); + } + + public ImplVec(IHostEnvironment env, ModelLoadContext ctx, string scoreColumnKind) + : base(env, ctx, scoreColumnKind) + { + int classCount = CheckLabelColumn(Host, PredictorModels, false); + _scoreType = new VectorType(NumberType.R4, classCount); + } + } + + // 
This is an implementation of pipeline ensembles that combines scores of type float, and also provides calibration (for binary classification). + private sealed class ImplOneWithCalibrator : SchemaBindablePipelineEnsemble, ISelfCalibratingPredictor + { + protected override ColumnType ScoreType { get { return NumberType.R4; } } + + public override PredictionKind PredictionKind { get { return PredictionKind.BinaryClassification; } } + + public ImplOneWithCalibrator(IHostEnvironment env, IPredictorModel[] predictors, IOutputCombiner combiner) + : base(env, predictors, combiner, LoaderSignature, MetadataUtils.Const.ScoreColumnKind.BinaryClassification) + { + Host.Assert(_scoreColumnKind == MetadataUtils.Const.ScoreColumnKind.BinaryClassification); + CheckBinaryLabel(true, Host, PredictorModels); + } + + public ImplOneWithCalibrator(IHostEnvironment env, ModelLoadContext ctx, string scoreColumnKind) + : base(env, ctx, scoreColumnKind) + { + Host.Assert(_scoreColumnKind == MetadataUtils.Const.ScoreColumnKind.BinaryClassification); + CheckBinaryLabel(false, Host, PredictorModels); + } + + private static void CheckBinaryLabel(bool user, IHostEnvironment env, IPredictorModel[] predictors) + { + int classCount = CheckLabelColumn(env, predictors, true); + if (classCount != 2) + { + var error = string.Format("Expected label to have exactly 2 classes, instead has {0}", classCount); + throw user ? 
env.ExceptParam(nameof(predictors), error) : env.ExceptDecode(error); + } + } + + public IPredictor Calibrate(IChannel ch, IDataView data, ICalibratorTrainer caliTrainer, int maxRows) + { + Host.CheckValue(ch, nameof(ch)); + ch.CheckValue(data, nameof(data)); + ch.CheckValue(caliTrainer, nameof(caliTrainer)); + + if (caliTrainer.NeedsTraining) + { + var bound = new Bound(this, RoleMappedSchema.Create(data.Schema)); + using (var curs = data.GetRowCursor(col => true)) + { + var scoreGetter = (ValueGetter)bound.CreateScoreGetter(curs, col => true, out Action disposer); + + // We assume that we can use the label column of the first predictor, since if the labels are not identical + // then the whole model is garbage anyway. + var labelGetter = bound.GetLabelGetter(curs, 0, out Action disp); + disposer += disp; + var weightGetter = bound.GetWeightGetter(curs, 0, out disp); + disposer += disp; + try + { + int num = 0; + while (curs.MoveNext()) + { + Single label = 0; + labelGetter(ref label); + if (!FloatUtils.IsFinite(label)) + continue; + Single score = 0; + scoreGetter(ref score); + if (!FloatUtils.IsFinite(score)) + continue; + Single weight = 0; + weightGetter(ref weight); + if (!FloatUtils.IsFinite(weight)) + continue; + + caliTrainer.ProcessTrainingExample(score, label > 0, weight); + + if (maxRows > 0 && ++num >= maxRows) + break; + } + } + finally + { + disposer?.Invoke(); + } + } + } + + var calibrator = caliTrainer.FinishTraining(ch); + return CalibratorUtils.CreateCalibratedPredictor(Host, this, calibrator); + } + } + + private readonly string[] _inputCols; + + protected readonly IHost Host; + + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "PIPELNEN", + //verWrittenCur: 0x00010001, // Initial + verWrittenCur: 0x00010002, // Save predictor models in a subdirectory + verReadableCur: 0x00010002, + verWeCanReadBack: 0x00010001, + loaderSignature: LoaderSignature); + } + public const string UserName = "Pipeline 
Ensemble"; + public const string LoaderSignature = "PipelineEnsemble"; + + private readonly string _scoreColumnKind; + + protected abstract ColumnType ScoreType { get; } + + public abstract PredictionKind PredictionKind { get; } + + internal IPredictorModel[] PredictorModels { get; } + + private SchemaBindablePipelineEnsembleBase(IHostEnvironment env, IPredictorModel[] predictors, string registrationName, string scoreColumnKind) + { + Contracts.CheckValue(env, nameof(env)); + Host = env.Register(registrationName); + Host.CheckNonEmpty(predictors, nameof(predictors)); + Host.CheckNonWhiteSpace(scoreColumnKind, nameof(scoreColumnKind)); + + PredictorModels = predictors; + _scoreColumnKind = scoreColumnKind; + + HashSet inputCols = null; + for (int i = 0; i < predictors.Length; i++) + { + var predModel = predictors[i]; + + // Get the input column names. + var inputSchema = predModel.TransformModel.InputSchema; + if (inputCols == null) + { + inputCols = new HashSet(); + for (int j = 0; j < inputSchema.ColumnCount; j++) + { + if (inputSchema.IsHidden(j)) + continue; + inputCols.Add(inputSchema.GetColumnName(j)); + } + _inputCols = inputCols.ToArray(); + } + else + { + int nonHiddenCols = 0; + for (int j = 0; j < inputSchema.ColumnCount; j++) + { + if (inputSchema.IsHidden(j)) + continue; + var name = inputSchema.GetColumnName(j); + if (!inputCols.Contains(name)) + throw Host.Except("Inconsistent schemas: Some schemas do not contain the column '{0}'", name); + nonHiddenCols++; + } + Host.Check(nonHiddenCols == _inputCols.Length, + "Inconsistent schemas: not all schemas have the same number of columns"); + } + } + } + + protected SchemaBindablePipelineEnsembleBase(IHostEnvironment env, ModelLoadContext ctx, string scoreColumnKind) + { + Host = env.Register(LoaderSignature); + Host.AssertNonEmpty(scoreColumnKind); + + _scoreColumnKind = scoreColumnKind; + + // *** Binary format *** + // int: id of _scoreColumnKind (loaded in the Create method) + // int: number of 
predictors + // The predictor models + // int: the number of input columns + // for each input column: + // int: id of the column name + + var length = ctx.Reader.ReadInt32(); + Host.CheckDecode(length > 0); + PredictorModels = new IPredictorModel[length]; + for (int i = 0; i < PredictorModels.Length; i++) + { + string dir = + ctx.Header.ModelVerWritten == 0x00010001 + ? "PredictorModels" + : Path.Combine(ctx.Directory, "PredictorModels"); + using (var ent = ctx.Repository.OpenEntry(dir, $"PredictorModel_{i:000}")) + PredictorModels[i] = new PredictorModel(Host, ent.Stream); + } + + length = ctx.Reader.ReadInt32(); + Host.CheckDecode(length >= 0); + _inputCols = new string[length]; + for (int i = 0; i < length; i++) + _inputCols[i] = ctx.LoadNonEmptyString(); + } + + public void Save(ModelSaveContext ctx) + { + Host.AssertValue(ctx); + ctx.SetVersionInfo(GetVersionInfo()); + + // *** Binary format *** + // int: id of _scoreColumnKind (loaded in the Create method) + // int: number of predictors + // The predictor models + // int: the number of input columns + // for each input column: + // int: id of the column name + + ctx.SaveNonEmptyString(_scoreColumnKind); + + Host.AssertNonEmpty(PredictorModels); + ctx.Writer.Write(PredictorModels.Length); + + for (int i = 0; i < PredictorModels.Length; i++) + { + var dir = Path.Combine(ctx.Directory, "PredictorModels"); + using (var ent = ctx.Repository.CreateEntry(dir, $"PredictorModel_{i:000}")) + PredictorModels[i].Save(Host, ent.Stream); + } + + Contracts.AssertValue(_inputCols); + ctx.Writer.Write(_inputCols.Length); + foreach (var name in _inputCols) + ctx.SaveNonEmptyString(name); + + SaveCore(ctx); + } + + protected abstract void SaveCore(ModelSaveContext ctx); + + public static SchemaBindablePipelineEnsembleBase Create(IHostEnvironment env, IPredictorModel[] predictors, IOutputCombiner combiner, string scoreColumnKind) + { + switch (scoreColumnKind) + { + case MetadataUtils.Const.ScoreColumnKind.BinaryClassification: 
+ var binaryCombiner = combiner as IOutputCombiner; + if (binaryCombiner == null) + throw env.Except("Combiner type incompatible with score column kind"); + return new ImplOneWithCalibrator(env, predictors, binaryCombiner); + case MetadataUtils.Const.ScoreColumnKind.Regression: + case MetadataUtils.Const.ScoreColumnKind.AnomalyDetection: + var regressionCombiner = combiner as IOutputCombiner; + if (regressionCombiner == null) + throw env.Except("Combiner type incompatible with score column kind"); + return new ImplOne(env, predictors, regressionCombiner, scoreColumnKind); + case MetadataUtils.Const.ScoreColumnKind.MultiClassClassification: + var vectorCombiner = combiner as IOutputCombiner>; + if (vectorCombiner == null) + throw env.Except("Combiner type incompatible with score column kind"); + return new ImplVec(env, predictors, vectorCombiner); + default: + throw env.Except("Unknown score kind"); + } + } + + public static SchemaBindablePipelineEnsembleBase Create(IHostEnvironment env, ModelLoadContext ctx) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(GetVersionInfo()); + + var scoreColumnKind = ctx.LoadNonEmptyString(); + switch (scoreColumnKind) + { + case MetadataUtils.Const.ScoreColumnKind.BinaryClassification: + return new ImplOneWithCalibrator(env, ctx, scoreColumnKind); + case MetadataUtils.Const.ScoreColumnKind.Regression: + case MetadataUtils.Const.ScoreColumnKind.AnomalyDetection: + return new ImplOne(env, ctx, scoreColumnKind); + case MetadataUtils.Const.ScoreColumnKind.MultiClassClassification: + return new ImplVec(env, ctx, scoreColumnKind); + default: + throw env.Except("Unknown score kind"); + } + } + + public abstract ISchemaBoundMapper Bind(IHostEnvironment env, RoleMappedSchema schema); + + public void SaveSummary(TextWriter writer, RoleMappedSchema schema) + { + for (int i = 0; i < PredictorModels.Length; i++) + { + writer.WriteLine("Partition model {0} summary:", i); + + if 
(!(PredictorModels[i].Predictor is ICanSaveSummary summaryModel)) + { + writer.WriteLine("Model of type {0}", PredictorModels[i].Predictor.GetType().Name); + continue; + } + + // Load the feature names for the i'th model. + var dv = new EmptyDataView(Host, PredictorModels[i].TransformModel.InputSchema); + PredictorModels[i].PrepareData(Host, dv, out RoleMappedData rmd, out IPredictor pred); + summaryModel.SaveSummary(writer, rmd.Schema); + } + } + + // Checks that the predictors have matching label columns, and returns the number of classes in all predictors. + protected static int CheckLabelColumn(IHostEnvironment env, IPredictorModel[] models, bool isBinary) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckNonEmpty(models, nameof(models)); + + var model = models[0]; + var edv = new EmptyDataView(env, model.TransformModel.InputSchema); + model.PrepareData(env, edv, out RoleMappedData rmd, out IPredictor pred); + var labelInfo = rmd.Schema.Label; + if (labelInfo == null) + throw env.Except("Training schema for model 0 does not have a label column"); + + var labelType = rmd.Schema.Schema.GetColumnType(rmd.Schema.Label.Index); + if (!labelType.IsKey) + return CheckNonKeyLabelColumnCore(env, pred, models, isBinary, labelType); + + if (isBinary && labelType.KeyCount != 2) + throw env.Except("Label is not binary"); + var schema = rmd.Schema.Schema; + var mdType = schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.KeyValues, labelInfo.Index); + if (mdType == null || !mdType.IsKnownSizeVector) + throw env.Except("Label column of type key must have a vector of key values metadata"); + + return Utils.MarshalInvoke(CheckKeyLabelColumnCore, mdType.ItemType.RawType, env, models, labelType.AsKey, schema, labelInfo.Index, mdType); + } + + // When the label column is not a key, we check that the number of classes is the same for all the predictors, by checking the + // OutputType property of the IValueMapper. 
+ // If any of the predictors do not implement IValueMapper we throw an exception. Returns the class count. + private static int CheckNonKeyLabelColumnCore(IHostEnvironment env, IPredictor pred, IPredictorModel[] models, bool isBinary, ColumnType labelType) + { + env.Assert(!labelType.IsKey); + env.AssertNonEmpty(models); + + if (isBinary) + return 2; + + // The label is numeric, we just have to check that the number of classes is the same. + if (!(pred is IValueMapper vm)) + throw env.Except("Cannot determine the number of classes the predictor outputs"); + var classCount = vm.OutputType.VectorSize; + + for (int i = 1; i < models.Length; i++) + { + var model = models[i]; + var edv = new EmptyDataView(env, model.TransformModel.InputSchema); + model.PrepareData(env, edv, out RoleMappedData rmd, out pred); + vm = pred as IValueMapper; + if (vm.OutputType.VectorSize != classCount) + throw env.Except("Label of model {0} has different number of classes than model 0", i); + } + return classCount; + } + + // Checks that all the label columns of the model have the same key type as their label column - including the same + // cardinality and the same key values, and returns the cardinality of the label column key. 
+ private static int CheckKeyLabelColumnCore(IHostEnvironment env, IPredictorModel[] models, KeyType labelType, ISchema schema, int labelIndex, ColumnType keyValuesType) + where T : IEquatable + { + env.Assert(keyValuesType.ItemType.RawType == typeof(T)); + env.AssertNonEmpty(models); + var labelNames = default(VBuffer); + schema.GetMetadata(MetadataUtils.Kinds.KeyValues, labelIndex, ref labelNames); + var classCount = labelNames.Length; + + var curLabelNames = default(VBuffer); + for (int i = 1; i < models.Length; i++) + { + var model = models[i]; + var edv = new EmptyDataView(env, model.TransformModel.InputSchema); + model.PrepareData(env, edv, out RoleMappedData rmd, out IPredictor pred); + var labelInfo = rmd.Schema.Label; + if (labelInfo == null) + throw env.Except("Training schema for model {0} does not have a label column", i); + + var curLabelType = rmd.Schema.Schema.GetColumnType(rmd.Schema.Label.Index); + if (!labelType.Equals(curLabelType.AsKey)) + throw env.Except("Label column of model {0} has different type than model 0", i); + + var mdType = rmd.Schema.Schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.KeyValues, labelInfo.Index); + if (!mdType.Equals(keyValuesType)) + throw env.Except("Label column of model {0} has different key value type than model 0", i); + rmd.Schema.Schema.GetMetadata(MetadataUtils.Kinds.KeyValues, labelInfo.Index, ref curLabelNames); + if (!AreEqual(ref labelNames, ref curLabelNames)) + throw env.Except("Label of model {0} has different values than model 0", i); + } + return classCount; + } + + private static bool AreEqual(ref VBuffer v1, ref VBuffer v2) + where T : IEquatable + { + if (v1.Length != v2.Length) + return false; + return v1.DenseValues().Zip(v2.DenseValues(), (x1, x2) => x1.Equals(x2)).All(b => b); + } + + /// + /// This method outputs a Key-Value Pair (kvp) per model in the ensemble. + /// * The key is the model number such as "Partition model 0 summary". 
If the model implements + /// then this string is followed by the first line of the model summary (the first line contains a description specific to the + /// model kind, such as "Feature gains" for FastTree or "Feature weights" for linear). + /// * The value: + /// - If the model implements then the value is the list of Key-Value pairs + /// containing the detailed summary for that model (for example, linear models have a list containing kvps where the keys + /// are the feature names and the values are the weights. FastTree has a similar list with the feature gains as values). + /// - If the model does not implement but does implement , + /// the value is a string containing the summary of that model. + /// - If neither of those interfaces are implemented then the value is a string containing the name of the type of model. + /// + /// + public IList> GetSummaryInKeyValuePairs(RoleMappedSchema schema) + { + Host.CheckValueOrNull(schema); + + var list = new List>(); + + var sb = new StringBuilder(); + for (int i = 0; i < PredictorModels.Length; i++) + { + var key = string.Format("Partition model {0} summary:", i); + var summaryKvps = PredictorModels[i].Predictor as ICanGetSummaryInKeyValuePairs; + var summaryModel = PredictorModels[i].Predictor as ICanSaveSummary; + if (summaryKvps == null && summaryModel == null) + { + list.Add(new KeyValuePair(key, PredictorModels[i].Predictor.GetType().Name)); + continue; + } + + // Load the feature names for the i'th model. 
+ var dv = new EmptyDataView(Host, PredictorModels[i].TransformModel.InputSchema); + PredictorModels[i].PrepareData(Host, dv, out RoleMappedData rmd, out IPredictor pred); + + if (summaryModel != null) + { + sb.Clear(); + using (StringWriter sw = new StringWriter(sb)) + summaryModel.SaveSummary(sw, rmd.Schema); + } + + if (summaryKvps != null) + { + var listCur = summaryKvps.GetSummaryInKeyValuePairs(rmd.Schema); + if (summaryModel != null) + { + using (var reader = new StringReader(sb.ToString())) + { + string firstLine = null; + while (string.IsNullOrEmpty(firstLine)) + firstLine = reader.ReadLine(); + if (!string.IsNullOrEmpty(firstLine)) + key += ("\r\n" + firstLine); + } + } + list.Add(new KeyValuePair(key, listCur)); + } + else + { + Host.AssertValue(summaryModel); + list.Add(new KeyValuePair(key, sb.ToString())); + } + + } + return list; + } + + public string[] GetLabelNamesOrNull(out ColumnType labelType) + { + Host.AssertNonEmpty(PredictorModels); + return PredictorModels[0].GetLabelInfo(Host, out labelType); + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/BaseDisagreementDiversityMeasure.cs b/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/BaseDisagreementDiversityMeasure.cs new file mode 100644 index 0000000000..b2f7bad432 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/BaseDisagreementDiversityMeasure.cs @@ -0,0 +1,45 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;

namespace Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure
{
    /// <summary>
    /// Base class for disagreement-based diversity measures. For every unordered pair of candidate
    /// models it sums, over all cached predictions, the per-example "difference" produced by
    /// <see cref="GetDifference"/> (defined by derived classes).
    /// </summary>
    public abstract class BaseDisagreementDiversityMeasure<TOutput> : IDiversityMeasure<TOutput>
    {
        public List<ModelDiversityMetric<TOutput>> CalculateDiversityMeasure(
            IList<FeatureSubsetModel<IPredictorProducing<TOutput>>> models,
            ConcurrentDictionary<FeatureSubsetModel<IPredictorProducing<TOutput>>, TOutput[]> predictions)
        {
            // Diversity only makes sense for at least two models, and each model must have cached predictions.
            Contracts.Assert(models.Count > 1);
            Contracts.Assert(predictions.Count == models.Count);

            var diversityValues = new List<ModelDiversityMetric<TOutput>>();

            // Enumerate unordered pairs (i, j) with i < j.
            for (int i = 0; i < models.Count - 1; i++)
            {
                for (int j = i + 1; j < models.Count; j++)
                {
                    Single differencesCount = 0;
                    var modelXOutputs = predictions[models[i]];
                    var modelYOutputs = predictions[models[j]];
                    // NOTE(review): iterates by modelXOutputs.Length — assumes both models produced the
                    // same number of predictions; confirm at the call site.
                    for (int k = 0; k < modelXOutputs.Length; k++)
                    {
                        differencesCount += GetDifference(ref modelXOutputs[k], ref modelYOutputs[k]);
                    }
                    diversityValues.Add(new ModelDiversityMetric<TOutput>()
                    {
                        DiversityNumber = differencesCount,
                        ModelX = models[i],
                        ModelY = models[j]
                    });
                }
            }
            return diversityValues;
        }

        // Per-example disagreement between two predictions; semantics are defined by the derived class.
        protected abstract Single GetDifference(ref TOutput tOutput1, ref TOutput tOutput2);
    }
}
diff --git a/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/DisagreementDiversityMeasure.cs b/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/DisagreementDiversityMeasure.cs
new file mode 100644
index 0000000000..8c41953d6b
--- /dev/null
+++ b/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/DisagreementDiversityMeasure.cs
@@ -0,0 +1,25 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Ensemble.Selector;
using Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure;

[assembly: LoadableClass(typeof(DisagreementDiversityMeasure), null, typeof(SignatureEnsembleDiversityMeasure),
    DisagreementDiversityMeasure.UserName, DisagreementDiversityMeasure.LoadName)]

namespace Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure
{
    /// <summary>
    /// Diversity measure for binary classification: two scores "disagree" when they lie on
    /// opposite sides of the zero decision threshold.
    /// </summary>
    public class DisagreementDiversityMeasure : BaseDisagreementDiversityMeasure<Single>
    {
        public const string UserName = "Disagreement Diversity Measure";
        public const string LoadName = "DisagreementDiversityMeasure";

        protected override Single GetDifference(ref Single valueX, ref Single valueY)
        {
            // 1 when the signs differ (opposite predicted labels), 0 otherwise.
            // A score of exactly 0 never counts as a disagreement.
            return (valueX > 0 && valueY < 0 || valueX < 0 && valueY > 0) ? 1 : 0;
        }
    }
}
diff --git a/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/ModelDiversityMetric.cs b/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/ModelDiversityMetric.cs
new file mode 100644
index 0000000000..1ee03a9489
--- /dev/null
+++ b/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/ModelDiversityMetric.cs
@@ -0,0 +1,15 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;

namespace Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure
{
    /// <summary>
    /// Result of one pairwise diversity computation: the two compared models and their
    /// accumulated disagreement score.
    /// </summary>
    public class ModelDiversityMetric<TOutput>
    {
        public FeatureSubsetModel<IPredictorProducing<TOutput>> ModelX { get; set; }
        public FeatureSubsetModel<IPredictorProducing<TOutput>> ModelY { get; set; }
        // Sum of per-example differences between ModelX's and ModelY's predictions.
        public Single DiversityNumber { get; set; }
    }
}
diff --git a/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/MultiDisagreementDiversityMeasure.cs b/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/MultiDisagreementDiversityMeasure.cs
new file mode 100644
index 0000000000..1ccf9b2f65
--- /dev/null
+++ b/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/MultiDisagreementDiversityMeasure.cs
@@ -0,0 +1,26 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Ensemble.Selector;
using Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure;
using Microsoft.ML.Runtime.Numeric;

[assembly: LoadableClass(typeof(MultiDisagreementDiversityMeasure), null, typeof(SignatureEnsembleDiversityMeasure),
    DisagreementDiversityMeasure.UserName, MultiDisagreementDiversityMeasure.LoadName)]

namespace Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure
{
    /// <summary>
    /// Diversity measure for multi-class: two score vectors "disagree" when their argmax
    /// (the predicted class) differs.
    /// </summary>
    public class MultiDisagreementDiversityMeasure : BaseDisagreementDiversityMeasure<VBuffer<Single>>
    {
        public const string LoadName = "MultiDisagreementDiversityMeasure";

        protected override Single GetDifference(ref VBuffer<Single> valueX, ref VBuffer<Single> valueY)
        {
            return (VectorUtils.ArgMax(ref valueX) != VectorUtils.ArgMax(ref valueY)) ? 1 : 0;
        }
    }
}
diff --git a/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/RegressionDisagreementDiversityMeasure.cs b/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/RegressionDisagreementDiversityMeasure.cs
new file mode 100644
index 0000000000..5a55545fe0
--- /dev/null
+++ b/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/RegressionDisagreementDiversityMeasure.cs
@@ -0,0 +1,24 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Ensemble.Selector;
using Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure;

[assembly: LoadableClass(typeof(RegressionDisagreementDiversityMeasure), null, typeof(SignatureEnsembleDiversityMeasure),
    DisagreementDiversityMeasure.UserName, RegressionDisagreementDiversityMeasure.LoadName)]

namespace Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure
{
    /// <summary>
    /// Diversity measure for regression: the disagreement between two predictions is their
    /// absolute difference (a continuous measure, unlike the binary/multi-class 0-or-1 measures).
    /// </summary>
    public class RegressionDisagreementDiversityMeasure : BaseDisagreementDiversityMeasure<Single>
    {
        public const string LoadName = "RegressionDisagreementDiversityMeasure";

        protected override Single GetDifference(ref Single valueX, ref Single valueY)
        {
            return Math.Abs(valueX - valueY);
        }
    }
}
diff --git a/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/AllFeatureSelector.cs b/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/AllFeatureSelector.cs
new file mode 100644
index 0000000000..be532901a6
--- /dev/null
+++ b/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/AllFeatureSelector.cs
@@ -0,0 +1,28 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Ensemble.Selector;
using Microsoft.ML.Runtime.Ensemble.Selector.FeatureSelector;

[assembly: LoadableClass(typeof(AllFeatureSelector), null, typeof(SignatureEnsembleFeatureSelector),
    AllFeatureSelector.UserName, AllFeatureSelector.LoadName)]

namespace Microsoft.ML.Runtime.Ensemble.Selector.FeatureSelector
{
    /// <summary>
    /// Trivial feature selector: keeps every feature, returning the data unchanged as a subset.
    /// </summary>
    public sealed class AllFeatureSelector : IFeatureSelector
    {
        public const string UserName = "All Feature Selector";
        public const string LoadName = "AllFeatureSelector";

        // The environment is unused; the parameter exists to satisfy the component-creation signature.
        public AllFeatureSelector(IHostEnvironment env)
        {
        }

        public Subset SelectFeatures(RoleMappedData data, IRandom rand)
        {
            return new Subset(data);
        }
    }
}
diff --git a/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/RandomFeatureSelector.cs b/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/RandomFeatureSelector.cs
new file mode 100644
index 0000000000..d239d8a23e
--- /dev/null
+++ b/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/RandomFeatureSelector.cs
@@ -0,0 +1,58 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.CommandLine;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Ensemble.Selector;
using Microsoft.ML.Runtime.Ensemble.Selector.FeatureSelector;
using Microsoft.ML.Runtime.Training;

[assembly: LoadableClass(typeof(RandomFeatureSelector), typeof(RandomFeatureSelector.Arguments),
    typeof(SignatureEnsembleFeatureSelector), RandomFeatureSelector.UserName, RandomFeatureSelector.LoadName)]

namespace Microsoft.ML.Runtime.Ensemble.Selector.FeatureSelector
{
    /// <summary>
    /// Feature selector that keeps each feature independently with probability
    /// <see cref="Arguments.FeaturesSelectionProportion"/>, producing a random feature subset per model.
    /// </summary>
    public class RandomFeatureSelector : IFeatureSelector
    {
        public const string UserName = "Random Feature Selector";
        public const string LoadName = "RandomFeatureSelector";

        public class Arguments
        {
            [Argument(ArgumentType.AtMostOnce, HelpText = "The proportion of features to be selected. The range is 0.0-1.0", ShortName = "fp", SortOrder = 50)]
            public Single FeaturesSelectionProportion = 0.8f;
        }

        private readonly Arguments _args;
        private readonly IHost _host;

        public RandomFeatureSelector(IHostEnvironment env, Arguments args)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(args, nameof(args));

            _host = env.Register(LoadName);
            _args = args;
            // NOTE(review): the check enforces the open interval (0, 1), but the argument help text
            // advertises "0.0-1.0" — the endpoints are rejected; confirm which contract is intended.
            _host.Check(0 < _args.FeaturesSelectionProportion && _args.FeaturesSelectionProportion < 1,
                "The feature proportion for RandomFeatureSelector should be greater than 0 and lesser than 1");
        }

        public Subset SelectFeatures(RoleMappedData data, IRandom rand)
        {
            _host.CheckValue(data, nameof(data));
            data.CheckFeatureFloatVector();

            // Draw an independent coin per feature slot; selected slots are marked in the bit mask.
            var type = data.Schema.Feature.Type;
            int len = type.VectorSize;
            var features = new BitArray(len);
            for (int j = 0; j < len; j++)
                features[j] = rand.NextDouble() < _args.FeaturesSelectionProportion;
            var dataNew = EnsembleUtils.SelectFeatures(_host, data, features);
            return new Subset(dataNew, features);
        }
    }
}
diff --git
a/src/Microsoft.ML.Ensemble/Selector/IDiversityMeasure.cs b/src/Microsoft.ML.Ensemble/Selector/IDiversityMeasure.cs
new file mode 100644
index 0000000000..af7ee1600a
--- /dev/null
+++ b/src/Microsoft.ML.Ensemble/Selector/IDiversityMeasure.cs
@@ -0,0 +1,18 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Collections.Concurrent;
using System.Collections.Generic;
using Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure;

namespace Microsoft.ML.Runtime.Ensemble.Selector
{
    /// <summary>
    /// Computes a pairwise diversity metric over a set of candidate models, given their cached
    /// predictions on shared data.
    /// </summary>
    public interface IDiversityMeasure<TOutput>
    {
        List<ModelDiversityMetric<TOutput>> CalculateDiversityMeasure(IList<FeatureSubsetModel<IPredictorProducing<TOutput>>> models,
            ConcurrentDictionary<FeatureSubsetModel<IPredictorProducing<TOutput>>, TOutput[]> predictions);
    }

    // Component-catalog signature for loadable diversity-measure implementations.
    public delegate void SignatureEnsembleDiversityMeasure();
}
diff --git a/src/Microsoft.ML.Ensemble/Selector/IFeatureSelector.cs b/src/Microsoft.ML.Ensemble/Selector/IFeatureSelector.cs
new file mode 100644
index 0000000000..1a046fe0f4
--- /dev/null
+++ b/src/Microsoft.ML.Ensemble/Selector/IFeatureSelector.cs
@@ -0,0 +1,15 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Runtime.Data;

namespace Microsoft.ML.Runtime.Ensemble.Selector
{
    /// <summary>
    /// Chooses a (possibly random) subset of features from the training data for one ensemble member.
    /// </summary>
    public interface IFeatureSelector
    {
        Subset SelectFeatures(RoleMappedData data, IRandom rand);
    }

    // Component-catalog signature for loadable feature-selector implementations.
    public delegate void SignatureEnsembleFeatureSelector();
}
diff --git a/src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs b/src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs
new file mode 100644
index 0000000000..f80a2d4290
--- /dev/null
+++ b/src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs
@@ -0,0 +1,29 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;

namespace Microsoft.ML.Runtime.Ensemble.Selector
{
    /// <summary>
    /// Prunes a trained set of candidate ensemble members down to the subset that will be combined,
    /// and computes any metrics the pruning decision needs.
    /// </summary>
    public interface ISubModelSelector<TOutput>
    {
        IList<FeatureSubsetModel<IPredictorProducing<TOutput>>> Prune(IList<FeatureSubsetModel<IPredictorProducing<TOutput>>> models);

        void CalculateMetrics(FeatureSubsetModel<IPredictorProducing<TOutput>> model, ISubsetSelector subsetSelector, Subset subset,
            Batch batch, bool needMetrics);

        // Fraction of the data held out for validation when evaluating sub-models.
        Single ValidationDatasetProportion { get; }
    }

    public interface IRegressionSubModelSelector : ISubModelSelector<Single>
    {
    }

    public interface IBinarySubModelSelector : ISubModelSelector<Single>
    {
    }

    // Component-catalog signature for loadable sub-model-selector implementations.
    public delegate void SignatureEnsembleSubModelSelector();
}
diff --git a/src/Microsoft.ML.Ensemble/Selector/ISubsetSelector.cs b/src/Microsoft.ML.Ensemble/Selector/ISubsetSelector.cs
new file mode 100644
index 0000000000..90209f5391
--- /dev/null
+++ b/src/Microsoft.ML.Ensemble/Selector/ISubsetSelector.cs
@@ -0,0 +1,20 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.ML.Runtime.Data;

namespace Microsoft.ML.Runtime.Ensemble.Selector
{
    /// <summary>
    /// Partitions the training data into batches and, within each batch, into the (instance/feature)
    /// subsets that individual ensemble members are trained on.
    /// </summary>
    public interface ISubsetSelector
    {
        void Initialize(RoleMappedData data, int size, int batchSize, Single validationDatasetProportion);
        IEnumerable<Batch> GetBatches(IRandom rand);
        IEnumerable<Subset> GetSubsets(Batch batch, IRandom rand);
        RoleMappedData GetTestData(Subset subset, Batch batch);
    }

    // Component-catalog signature for loadable data-selector implementations.
    public delegate void SignatureEnsembleDataSelector();
}
diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelector.cs
new file mode 100644
index 0000000000..cc3586b184
--- /dev/null
+++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelector.cs
@@ -0,0 +1,28 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Ensemble.Selector;
using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector;

[assembly: LoadableClass(typeof(AllSelector), null, typeof(SignatureEnsembleSubModelSelector), AllSelector.UserName, AllSelector.LoadName)]

namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector
{
    /// <summary>
    /// Trivial sub-model selector for scalar (binary/regression) ensembles: keeps all trained models
    /// and holds out no validation data.
    /// </summary>
    public class AllSelector : BaseSubModelSelector<Single>, IBinarySubModelSelector, IRegressionSubModelSelector
    {
        public const string UserName = "All Selector";
        public const string LoadName = "AllSelector";

        // No validation split is needed since nothing is pruned.
        public override Single ValidationDatasetProportion { get { return 0; } }

        // NOTE(review): reports BinaryClassification even though this selector also serves
        // regression ensembles — confirm the base class only uses this for scalar evaluation setup.
        protected override PredictionKind PredictionKind { get { return PredictionKind.BinaryClassification; } }

        public AllSelector(IHostEnvironment env)
            : base(env, LoadName)
        {
        }
    }
}
diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelectorMultiClass.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelectorMultiClass.cs
new file mode 100644
index 0000000000..ab9509c637
--- /dev/null
+++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelectorMultiClass.cs
@@ -0,0 +1,30 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
+ +using System; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble.Selector; +using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; + +[assembly: LoadableClass(typeof(AllSelectorMultiClass), null, typeof(SignatureEnsembleSubModelSelector), + AllSelectorMultiClass.UserName, AllSelectorMultiClass.LoadName)] + +namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector +{ + public class AllSelectorMultiClass : BaseSubModelSelector> + { + public const string UserName = "All Selector"; + public const string LoadName = "AllSelectorMultiClass"; + + public override Single ValidationDatasetProportion { get { return 0; } } + + protected override PredictionKind PredictionKind { get { return PredictionKind.MultiClassClassification; } } + + public AllSelectorMultiClass(IHostEnvironment env) + : base(env, LoadName) + { + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseBestPerformanceSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseBestPerformanceSelector.cs new file mode 100644 index 0000000000..3126a82885 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseBestPerformanceSelector.cs @@ -0,0 +1,124 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Reflection; +using Microsoft.ML.Runtime.CommandLine; + +namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector +{ + public abstract class BaseBestPerformanceSelector : SubModelDataSelector + { + protected abstract string MetricName { get; } + + protected virtual bool IsAscMetric + { + get { return true; } + } + + protected BaseBestPerformanceSelector(ArgumentsBase args, IHostEnvironment env, string name) + : base(args, env, name) + { + } + + public override void CalculateMetrics(FeatureSubsetModel> model, + ISubsetSelector subsetSelector, Subset subset, Batch batch, bool needMetrics) + { + base.CalculateMetrics(model, subsetSelector, subset, batch, true); + } + + public override IList>> Prune(IList>> models) + { + using (var ch = Host.Start("Pruning")) + { + var sortedModels = models.ToArray(); + Array.Sort(sortedModels, new ModelPerformanceComparer(MetricName, IsAscMetric)); + Print(ch, sortedModels, MetricName); + int modelCountToBeSelected = (int)(models.Count * LearnersSelectionProportion); + if (modelCountToBeSelected == 0) + modelCountToBeSelected = 1; + + var retval = sortedModels.Where(m => m != null).Take(modelCountToBeSelected).ToList(); + ch.Done(); + return retval; + } + } + + protected static string FindMetricName(Type type, object value) + { + Contracts.Assert(type.IsEnum); + Contracts.Assert(value.GetType() == type); + + foreach (var field in type.GetFields(BindingFlags.Public | BindingFlags.Static | BindingFlags.DeclaredOnly)) + { + if (field.FieldType != type) + continue; + if (field.GetCustomAttribute() != null) + continue; + var displayAttr = field.GetCustomAttribute(); + if (displayAttr != null) + { + var valCur = field.GetValue(null); + if (value.Equals(valCur)) + return displayAttr.Name; + } + } + Contracts.Assert(false); + return null; + } + + private sealed class ModelPerformanceComparer : IComparer>> + { + private readonly string _metricName; + 
private readonly bool _isAscMetric; + + public ModelPerformanceComparer(string metricName, bool isAscMetric) + { + Contracts.AssertValue(metricName); + + _metricName = metricName; + _isAscMetric = isAscMetric; + } + + public int Compare(FeatureSubsetModel> x, FeatureSubsetModel> y) + { + if (x == null || y == null) + return (x == null ? 0 : 1) - (y == null ? 0 : 1); + double xValue = 0; + var found = false; + foreach (var kvp in x.Metrics) + { + if (_metricName == kvp.Key) + { + xValue = kvp.Value; + found = true; + break; + } + } + if (!found) + throw Contracts.Except("Metrics did not contain the requested metric '{0}'", _metricName); + double yValue = 0; + found = false; + foreach (var kvp in y.Metrics) + { + if (_metricName == kvp.Key) + { + yValue = kvp.Value; + found = true; + break; + } + } + if (!found) + throw Contracts.Except("Metrics did not contain the requested metric '{0}'", _metricName); + if (xValue > yValue) + return _isAscMetric ? -1 : 1; + if (yValue > xValue) + return _isAscMetric ? 1 : -1; + return 0; + } + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseDiverseSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseDiverseSelector.cs new file mode 100644 index 0000000000..f11d687314 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseDiverseSelector.cs @@ -0,0 +1,144 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure; +using Microsoft.ML.Runtime.Internal.Internallearn; +using Microsoft.ML.Runtime.Internal.Utilities; +using Microsoft.ML.Runtime.Training; + +namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector +{ + public abstract class BaseDiverseSelector : SubModelDataSelector + where TDiversityMetric : class, IDiversityMeasure + { + public sealed class Arguments : ArgumentsBase + { + [Argument(ArgumentType.Multiple, HelpText = "The metric type to be used to find the diversity among base learners", ShortName = "dm", SortOrder = 50)] + [TGUI(Label = "Diversity Measure Type")] + public SubComponent DiversityMetricType; + } + + private readonly SubComponent _diversityMetricType; + private ConcurrentDictionary>, TOutput[]> _predictions; + + public abstract string DiversityMeasureLoadname { get; } + + protected internal BaseDiverseSelector(IHostEnvironment env, Arguments args, string name) + : base(args, env, name) + { + _diversityMetricType = args.DiversityMetricType; + _predictions = new ConcurrentDictionary>, TOutput[]>(); + } + + protected TDiversityMetric CreateDiversityMetric() + { + if (!_diversityMetricType.IsGood()) + { + var sc = new SubComponent(DiversityMeasureLoadname); + return sc.CreateInstance(Host); + } + return _diversityMetricType.CreateInstance(Host); + } + + public override void CalculateMetrics(FeatureSubsetModel> model, + ISubsetSelector subsetSelector, Subset subset, Batch batch, bool needMetrics) + { + base.CalculateMetrics(model, subsetSelector, subset, batch, needMetrics); + + var vm = model.Predictor as IValueMapper; + Host.Check(vm != null, "Predictor doesn't implement the expected interface"); + var map = vm.GetMapper, TOutput>(); + + TOutput[] preds = new TOutput[100]; + int count = 0; + var data = 
subsetSelector.GetTestData(subset, batch); + using (var cursor = new FeatureFloatVectorCursor(data, CursOpt.AllFeatures)) + { + while (cursor.MoveNext()) + { + Utils.EnsureSize(ref preds, count + 1); + map(ref cursor.Features, ref preds[count]); + count++; + } + } + Array.Resize(ref preds, count); + _predictions[model] = preds; + } + + /// + /// This calculates the diversity by calculating the disagreement measure which is defined as the sum of the number of instances correctly(incorrectly) + /// classified by the first classifier and incorrectly(correctly) classified by the second classifier over the total number of instances. + /// All the pairwise classifiers are sorted out to take the most diverse classifiers. + /// + /// + /// + public override IList>> Prune(IList>> models) + { + if (models.Count <= 1) + return models; + + // 1. Find the disagreement number + List> diversityValues = CalculateDiversityMeasure(models, _predictions); + _predictions.Clear(); + + // 2. Sort all the pairwise classifiers + var sortedModels = diversityValues.ToArray(); + Array.Sort(sortedModels, new ModelDiversityComparer()); + var modelCountToBeSelected = (int)(models.Count * LearnersSelectionProportion); + + if (modelCountToBeSelected == 0) + modelCountToBeSelected++; + + // 3.
Take the most diverse classifiers + var selectedModels = new List>>(); + foreach (var item in sortedModels) + { + if (selectedModels.Count < modelCountToBeSelected) + { + if (!selectedModels.Contains(item.ModelX)) + { + selectedModels.Add(item.ModelX); + } + } + + if (selectedModels.Count < modelCountToBeSelected) + { + if (!selectedModels.Contains(item.ModelY)) + { + selectedModels.Add(item.ModelY); + continue; + } + } + else + { + break; + } + } + + return selectedModels; + } + + public abstract List> CalculateDiversityMeasure(IList>> models, + ConcurrentDictionary>, TOutput[]> predictions); + + public class ModelDiversityComparer : IComparer> + { + public int Compare(ModelDiversityMetric x, ModelDiversityMetric y) + { + if (x == null || y == null) + return 0; + if (x.DiversityNumber > y.DiversityNumber) + return -1; + if (y.DiversityNumber > x.DiversityNumber) + return 1; + return 0; + } + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseSubModelSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseSubModelSelector.cs new file mode 100644 index 0000000000..518250bbf5 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseSubModelSelector.cs @@ -0,0 +1,137 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; + +namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector +{ + public abstract class BaseSubModelSelector : ISubModelSelector + { + protected readonly IHost Host; + + public abstract Single ValidationDatasetProportion { get; } + + protected abstract PredictionKind PredictionKind { get; } + + protected BaseSubModelSelector(IHostEnvironment env, string name) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckNonWhiteSpace(name, nameof(name)); + Host = env.Register(name); + } + + protected void Print(IChannel ch, IList>> models, string metricName) + { + // REVIEW tfinley: The output format was faithfully reproduced from the original format, but it's unclear + // to me that this is right. Why have two bars in the header line, but only one bar in the results? + ch.Info("List of models and the metrics after sorted"); + ch.Info("| {0}(Sorted) || Name of Model |", metricName); + foreach (var model in models) + { + var metric = 0.0; + var found = false; + foreach (var kvp in model.Metrics) + { + if (kvp.Key == metricName) + { + metric = kvp.Value; + found = true; + } + } + if (!found) + throw ch.Except("Metrics did not contain the requested metric '{0}'", metricName); + ch.Info("| {0} |{1}", metric, model.Predictor.GetType().Name); + } + } + + public virtual IList>> Prune(IList>> models) + { + return models; + } + + private SubComponent GetEvaluatorSubComponent() + { + switch (PredictionKind) + { + case PredictionKind.BinaryClassification: + return new SubComponent(BinaryClassifierEvaluator.LoadName); + case PredictionKind.Regression: + return new SubComponent(RegressionEvaluator.LoadName); + case PredictionKind.MultiClassClassification: + return new SubComponent(MultiClassClassifierEvaluator.LoadName); + default: + throw Host.Except("Unrecognized prediction kind '{0}'", PredictionKind); + } + } + + public 
virtual void CalculateMetrics(FeatureSubsetModel> model, + ISubsetSelector subsetSelector, Subset subset, Batch batch, bool needMetrics) + { + if (!needMetrics || model == null || model.Metrics != null) + return; + + using (var ch = Host.Start("Calculate metrics")) + { + RoleMappedData testData = subsetSelector.GetTestData(subset, batch); + // Because the training and test datasets are drawn from the same base dataset, the test data role mappings + // are the same as for the train data. + IDataScorerTransform scorePipe = ScoreUtils.GetScorer(model.Predictor, testData, Host, testData.Schema); + // REVIEW tfinley: Should we somehow allow the user to customize the evaluator? + // By what mechanism should we allow that? + var evalComp = GetEvaluatorSubComponent(); + RoleMappedData scoredTestData = RoleMappedData.Create(scorePipe, + GetColumnRoles(testData.Schema, scorePipe.Schema)); + IEvaluator evaluator = evalComp.CreateInstance(Host); + // REVIEW yaeld: with the new evaluators, metrics of individual models are no longer + // printed to the Console. Consider adding an option on the combiner to print them. + // REVIEW yaeld(petelu): Consider adding an option to the combiner to save a data view + // containing all the results of the individual models. + var metricsDict = evaluator.Evaluate(scoredTestData); + if (!metricsDict.TryGetValue(MetricKinds.OverallMetrics, out IDataView metricsView)) + throw Host.Except("Evaluator did not produce any overall metrics"); + // REVIEW tfinley: We're assuming that the metrics of interest are always doubles here. 
+ var metrics = EvaluateUtils.GetMetrics(metricsView, getVectorMetrics: false); + model.Metrics = metrics.ToArray(); + ch.Done(); + } + } + + private IEnumerable> GetColumnRoles( + RoleMappedSchema testSchema, ISchema scoredSchema) + { + switch (PredictionKind) + { + case PredictionKind.BinaryClassification: + yield return RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Label, testSchema.Label.Name); + var scoreInfo = EvaluateUtils.GetScoreColumnInfo(Host, scoredSchema, null, nameof(BinaryClassifierMamlEvaluator.ArgumentsBase.ScoreColumn), + MetadataUtils.Const.ScoreColumnKind.BinaryClassification); + yield return RoleMappedSchema.CreatePair(MetadataUtils.Const.ScoreValueKind.Score, scoreInfo.Name); + // Get the optional probability column. + var probInfo = EvaluateUtils.GetOptAuxScoreColumnInfo(Host, scoredSchema, null, nameof(BinaryClassifierMamlEvaluator.Arguments.ProbabilityColumn), + scoreInfo.Index, MetadataUtils.Const.ScoreValueKind.Probability, t => t == NumberType.Float); + if (probInfo != null) + yield return RoleMappedSchema.CreatePair(MetadataUtils.Const.ScoreValueKind.Probability, probInfo.Name); + yield break; + case PredictionKind.Regression: + yield return RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Label, testSchema.Label.Name); + scoreInfo = EvaluateUtils.GetScoreColumnInfo(Host, scoredSchema, null, nameof(RegressionMamlEvaluator.Arguments.ScoreColumn), + MetadataUtils.Const.ScoreColumnKind.Regression); + yield return RoleMappedSchema.CreatePair(MetadataUtils.Const.ScoreValueKind.Score, scoreInfo.Name); + yield break; + case PredictionKind.MultiClassClassification: + yield return RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Label, testSchema.Label.Name); + scoreInfo = EvaluateUtils.GetScoreColumnInfo(Host, scoredSchema, null, nameof(MultiClassMamlEvaluator.Arguments.ScoreColumn), + MetadataUtils.Const.ScoreColumnKind.MultiClassClassification); + yield return 
RoleMappedSchema.CreatePair(MetadataUtils.Const.ScoreValueKind.Score, scoreInfo.Name); + yield break; + default: + throw Host.Except("Unrecognized prediction kind '{0}'", PredictionKind); + } + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs new file mode 100644 index 0000000000..917a957190 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs @@ -0,0 +1,46 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Ensemble.Selector; +using Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure; +using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; + +[assembly: LoadableClass(typeof(BestDiverseSelectorBinary), typeof(BestDiverseSelectorBinary.Arguments), + typeof(SignatureEnsembleSubModelSelector), BestDiverseSelectorBinary.UserName, BestDiverseSelectorBinary.LoadName)] + +namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector +{ + using TScalarPredictor = IPredictorProducing; + public sealed class BestDiverseSelectorBinary : BaseDiverseSelector, IBinarySubModelSelector + { + public const string UserName = "Best Diverse Selector"; + public const string LoadName = "BestDiverseSelector"; + + public override string DiversityMeasureLoadname + { + get { return DisagreementDiversityMeasure.LoadName; } + } + + public BestDiverseSelectorBinary(IHostEnvironment env, Arguments args) + : base(env, args, LoadName) + { + } + + public override List> CalculateDiversityMeasure(IList> models, + ConcurrentDictionary, Single[]> predictions) + { + var diversityMetric = 
CreateDiversityMetric(); + return diversityMetric.CalculateDiversityMeasure(models, predictions); + } + + protected override PredictionKind PredictionKind + { + get { return PredictionKind.BinaryClassification; } + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs new file mode 100644 index 0000000000..d95b10ff51 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs @@ -0,0 +1,50 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble.Selector; +using Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure; +using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; + +[assembly: LoadableClass(typeof(BestDiverseSelectorMultiClass), typeof(BestDiverseSelectorMultiClass.Arguments), + typeof(SignatureEnsembleSubModelSelector), BestDiverseSelectorMultiClass.UserName, BestDiverseSelectorMultiClass.LoadName)] + +namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector +{ + using TVectorPredictor = IPredictorProducing>; + public sealed class BestDiverseSelectorMultiClass : BaseDiverseSelector, IDiversityMeasure>> + { + public const string UserName = "Best Diverse Selector"; + public const string LoadName = "BestDiverseSelectorMultiClass"; + + public override string DiversityMeasureLoadname + { + get { return MultiDisagreementDiversityMeasure.LoadName; } + } + + public BestDiverseSelectorMultiClass(IHostEnvironment env, Arguments args) + : base(env, args, LoadName) + { + } + + protected override PredictionKind PredictionKind + { + 
get { return PredictionKind.MultiClassClassification; } + } + + public override List>> CalculateDiversityMeasure(IList> models, + ConcurrentDictionary, VBuffer[]> predictions) + { + Host.Assert(models.Count > 1); + Host.Assert(predictions.Count == models.Count); + + var diversityMetric = CreateDiversityMetric(); + return diversityMetric.CalculateDiversityMeasure(models, predictions); + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs new file mode 100644 index 0000000000..02466599fe --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs @@ -0,0 +1,46 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Ensemble.Selector; +using Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure; +using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; + +[assembly: LoadableClass(typeof(BestDiverseSelectorRegression), typeof(BestDiverseSelectorRegression.Arguments), + typeof(SignatureEnsembleSubModelSelector), BestDiverseSelectorRegression.UserName, BestDiverseSelectorRegression.LoadName)] + +namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector +{ + using TScalarPredictor = IPredictorProducing; + public sealed class BestDiverseSelectorRegression : BaseDiverseSelector, IRegressionSubModelSelector + { + public const string UserName = "Best Diverse Selector"; + public const string LoadName = "BestDiverseSelectorRegression"; + + public override string DiversityMeasureLoadname + { + get { return RegressionDisagreementDiversityMeasure.LoadName; } + } + + public 
BestDiverseSelectorRegression(IHostEnvironment env, Arguments args) + : base(env, args, LoadName) + { + } + + public override List> CalculateDiversityMeasure(IList> models, + ConcurrentDictionary, Single[]> predictions) + { + var diversityMetric = CreateDiversityMetric(); + return diversityMetric.CalculateDiversityMeasure(models, predictions); + } + + protected override PredictionKind PredictionKind + { + get { return PredictionKind.Regression; } + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs new file mode 100644 index 0000000000..5ed9afacb9 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs @@ -0,0 +1,57 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble.Selector; +using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; +using Microsoft.ML.Runtime.Internal.Internallearn; + +[assembly: LoadableClass(typeof(BestPerformanceRegressionSelector), typeof(BestPerformanceRegressionSelector.Arguments), + typeof(SignatureEnsembleSubModelSelector), BestPerformanceRegressionSelector.UserName, BestPerformanceRegressionSelector.LoadName)] + +namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector +{ + public sealed class BestPerformanceRegressionSelector : BaseBestPerformanceSelector, IRegressionSubModelSelector + { + public sealed class Arguments : ArgumentsBase + { + [Argument(ArgumentType.AtMostOnce, HelpText = "The metric type to be used to find the best performance", ShortName = "mn", SortOrder = 50)] + [TGUI(Label = "Metric Name")] + public RegressionEvaluator.Metrics MetricName = RegressionEvaluator.Metrics.L1; + } + public const string UserName = "Best Performance Selector"; + public const string LoadName = "BestPerformanceRegressionSelector"; + + private readonly RegressionEvaluator.Metrics _metric; + + private readonly string _metricName; + + public BestPerformanceRegressionSelector(IHostEnvironment env, Arguments args) + : base(args, env, LoadName) + { + Host.CheckUserArg(Enum.IsDefined(typeof(RegressionEvaluator.Metrics), args.MetricName), nameof(args.MetricName), "Undefined metric name"); + _metric = args.MetricName; + _metricName = FindMetricName(typeof(RegressionEvaluator.Metrics), _metric); + Host.Assert(!string.IsNullOrEmpty(_metricName)); + } + + protected override string MetricName + { + get { return _metricName; } + } + + protected override bool IsAscMetric + { + get { return false; } + } + + protected override PredictionKind PredictionKind + { + get { return PredictionKind.Regression; } + } + } +} diff --git 
a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs new file mode 100644 index 0000000000..c8047cabfb --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs @@ -0,0 +1,58 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble.Selector; +using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; +using Microsoft.ML.Runtime.Internal.Internallearn; + +[assembly: LoadableClass(typeof(BestPerformanceSelector), typeof(BestPerformanceSelector.Arguments), + typeof(SignatureEnsembleSubModelSelector), BestPerformanceSelector.UserName, BestPerformanceSelector.LoadName)] + +namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector +{ + public sealed class BestPerformanceSelector : BaseBestPerformanceSelector, IBinarySubModelSelector + { + public sealed class Arguments : ArgumentsBase + { + [Argument(ArgumentType.AtMostOnce, HelpText = "The metric type to be used to find the best performance", ShortName = "mn", SortOrder = 50)] + [TGUI(Label = "Metric Name")] + public BinaryClassifierEvaluator.Metrics MetricName = BinaryClassifierEvaluator.Metrics.Auc; + } + + public const string UserName = "Best Performance Selector"; + public const string LoadName = "BestPerformanceSelector"; + + private readonly BinaryClassifierEvaluator.Metrics _metric; + private readonly string _metricName; + + public BestPerformanceSelector(IHostEnvironment env, Arguments args) + : base(args, env, LoadName) + { + Host.CheckUserArg(Enum.IsDefined(typeof(BinaryClassifierEvaluator.Metrics), args.MetricName), + nameof(args.MetricName), 
"Undefined metric name"); + _metric = args.MetricName; + _metricName = FindMetricName(typeof(BinaryClassifierEvaluator.Metrics), _metric); + Host.Assert(!string.IsNullOrEmpty(_metricName)); + } + + protected override string MetricName + { + get { return _metricName; } + } + + protected override bool IsAscMetric + { + get { return _metric != BinaryClassifierEvaluator.Metrics.LogLoss; } + } + + protected override PredictionKind PredictionKind + { + get { return PredictionKind.BinaryClassification; } + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs new file mode 100644 index 0000000000..760482de23 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs @@ -0,0 +1,59 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble.Selector; +using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; +using Microsoft.ML.Runtime.Internal.Internallearn; + +[assembly: LoadableClass(typeof(BestPerformanceSelectorMultiClass), typeof(BestPerformanceSelectorMultiClass.Arguments), + typeof(SignatureEnsembleSubModelSelector), BestPerformanceSelectorMultiClass.UserName, BestPerformanceSelectorMultiClass.LoadName)] + +namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector +{ + public class BestPerformanceSelectorMultiClass : BaseBestPerformanceSelector> + { + public sealed class Arguments : ArgumentsBase + { + [Argument(ArgumentType.AtMostOnce, HelpText = "The metric type to be used to find the best performance", ShortName = "mn", SortOrder = 50)] + [TGUI(Label = "Metric Name")] + public MultiClassClassifierEvaluator.Metrics MetricName = MultiClassClassifierEvaluator.Metrics.AccuracyMicro; + } + + public const string UserName = "Best Performance Selector"; + public const string LoadName = "BestPerformanceSelectorMultiClass"; + + private readonly MultiClassClassifierEvaluator.Metrics _metric; + private readonly string _metricName; + + public BestPerformanceSelectorMultiClass(IHostEnvironment env, Arguments args) + : base(args, env, LoadName) + { + Host.CheckUserArg(Enum.IsDefined(typeof(MultiClassClassifierEvaluator.Metrics), args.MetricName), + nameof(args.MetricName), "Undefined metric name"); + _metric = args.MetricName; + _metricName = FindMetricName(typeof(MultiClassClassifierEvaluator.Metrics), _metric); + Host.Assert(!string.IsNullOrEmpty(_metricName)); + } + + protected override PredictionKind PredictionKind + { + get { return PredictionKind.MultiClassClassification; } + } + + protected override bool IsAscMetric + { + get { return _metric != MultiClassClassifierEvaluator.Metrics.LogLoss; } + } + + protected override string 
MetricName + { + get { return _metricName; } + } + } + +} diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/SubModelDataSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/SubModelDataSelector.cs new file mode 100644 index 0000000000..f3715ec353 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/SubModelDataSelector.cs @@ -0,0 +1,44 @@ +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Internal.Internallearn; +using System; +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector +{ + public abstract class SubModelDataSelector : BaseSubModelSelector + { + public abstract class ArgumentsBase + { + [Argument(ArgumentType.AtMostOnce, HelpText = "The proportion of best base learners to be selected. The range is 0.0-1.0", ShortName = "lp", SortOrder = 50)] + [TGUI(Label = "Learners Selection Proportion")] + public Single LearnersSelectionProportion = 0.5f; + + [Argument(ArgumentType.AtMostOnce, HelpText = "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", ShortName = "vp", SortOrder = 50)] + [TGUI(Label = "Validation Dataset Proportion")] + public Single ValidationDatasetProportion = 0.3f; + } + + private readonly Single _learnersSelectionProportion; + private readonly Single _validationDatasetProportion; + + public Single LearnersSelectionProportion { get { return _learnersSelectionProportion; } } + + public override Single ValidationDatasetProportion { get { return _validationDatasetProportion; } } + + protected SubModelDataSelector(ArgumentsBase args, IHostEnvironment env, string name) + : base(env, name) + { + Host.CheckValue(args, nameof(args)); + Host.CheckParam(0 <= args.ValidationDatasetProportion && args.ValidationDatasetProportion < 1, + nameof(args.ValidationDatasetProportion), + "Should be greater than or equal to 0 and less than 1"); + Host.CheckParam(0 < args.LearnersSelectionProportion && args.LearnersSelectionProportion < 1, + nameof(args.LearnersSelectionProportion), + "Should be greater than 0 and less than 1"); + _learnersSelectionProportion = args.LearnersSelectionProportion; + _validationDatasetProportion = args.ValidationDatasetProportion; + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/AllInstanceSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/AllInstanceSelector.cs new file mode 100644 index 0000000000..7bd84c914f --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/AllInstanceSelector.cs @@ -0,0 +1,35 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System.Collections.Generic; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Ensemble.Selector; +using Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector; + +[assembly: LoadableClass(typeof(AllInstanceSelector), typeof(AllInstanceSelector.Arguments), + typeof(SignatureEnsembleDataSelector), AllInstanceSelector.UserName, AllInstanceSelector.LoadName)] + +namespace Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector +{ + public sealed class AllInstanceSelector : BaseSubsetSelector + { + public const string UserName = "All Instance Selector"; + public const string LoadName = "AllInstanceSelector"; + + public sealed class Arguments : ArgumentsBase + { + } + + public AllInstanceSelector(IHostEnvironment env, Arguments args) + : base(args, env, LoadName) + { + } + + public override IEnumerable GetSubsets(Batch batch, IRandom rand) + { + for (int i = 0; i < Size; i++) + yield return FeatureSelector.SelectFeatures(batch.TrainInstances, rand); + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BaseSubsetSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BaseSubsetSelector.cs new file mode 100644 index 0000000000..d3a6206502 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BaseSubsetSelector.cs @@ -0,0 +1,105 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
using System;
using System.Collections.Generic;
using Microsoft.ML.Runtime.CommandLine;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Ensemble.Selector.FeatureSelector;

namespace Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector
{
    /// <summary>
    /// Shared plumbing for subset selectors: argument validation, optional
    /// train/validation splitting and batch generation.
    /// </summary>
    public abstract class BaseSubsetSelector<TArgs> : ISubsetSelector
        where TArgs : BaseSubsetSelector<TArgs>.ArgumentsBase
    {
        public abstract class ArgumentsBase
        {
            [Argument(ArgumentType.Multiple, HelpText = "The Feature selector", ShortName = "fs", SortOrder = 1)]
            public SubComponent FeatureSelector =
                new SubComponent(AllFeatureSelector.LoadName);
        }

        protected readonly IHost Host;
        protected readonly TArgs Args;
        protected readonly IFeatureSelector FeatureSelector;

        // State established by Initialize; GetBatches/GetSubsets must not be called before it.
        protected int Size;
        protected RoleMappedData Data;
        protected int BatchSize;
        protected Single ValidationDatasetProportion;

        protected BaseSubsetSelector(TArgs args, IHostEnvironment env, string name)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(args, nameof(args));
            env.CheckNonWhiteSpace(name, nameof(name));

            Host = env.Register(name);
            Args = args;
            FeatureSelector = Args.FeatureSelector.CreateInstance(Host);
        }

        public void Initialize(RoleMappedData data, int size, int batchSize, Single validationDatasetProportion)
        {
            Host.CheckValue(data, nameof(data));
            Host.CheckParam(size > 0, nameof(size));
            Host.CheckParam(0 <= validationDatasetProportion && validationDatasetProportion < 1,
                nameof(validationDatasetProportion), "Should be greater than or equal to 0 and less than 1");
            Data = data;
            Size = size;
            BatchSize = batchSize;
            ValidationDatasetProportion = validationDatasetProportion;
        }

        public abstract IEnumerable<Subset> GetSubsets(Batch batch, IRandom rand);

        public IEnumerable<Batch> GetBatches(IRandom rand)
        {
            // Calling GetBatches before Initialize is a usage error, not an internal
            // invariant, so fail with a runtime check rather than a debug-only assert.
            Host.Check(Data != null, "Must call Initialize first!");
            Host.AssertValue(rand);

            using (var ch = Host.Start("Getting batches"))
            {
                RoleMappedData dataTest;
                RoleMappedData dataTrain;

                // Split the data, if needed.
                if (!(ValidationDatasetProportion > 0))
                    dataTest = dataTrain = Data;
                else
                {
                    // Tag each row with a uniform random number, then carve off the
                    // bottom ValidationDatasetProportion of rows as the test set.
                    string name = Data.Data.Schema.GetTempColumnName();
                    var args = new GenerateNumberTransform.Arguments();
                    args.Column = new[] { new GenerateNumberTransform.Column() { Name = name } };
                    args.Seed = (uint)rand.Next();
                    var view = new GenerateNumberTransform(Host, args, Data.Data);
                    var viewTest = new RangeFilter(Host, new RangeFilter.Arguments() { Column = name, Max = ValidationDatasetProportion }, view);
                    var viewTrain = new RangeFilter(Host, new RangeFilter.Arguments() { Column = name, Max = ValidationDatasetProportion, Complement = true }, view);
                    dataTest = RoleMappedData.Create(viewTest, Data.Schema.GetColumnRoleNames());
                    dataTrain = RoleMappedData.Create(viewTrain, Data.Schema.GetColumnRoleNames());
                }

                if (BatchSize > 0)
                {
                    // REVIEW shonk: How should we carve the data into batches?
                    ch.Warning("Batch support is temporarily disabled");
                }

                yield return new Batch(dataTrain, dataTest);
                ch.Done();
            }
        }

        public virtual RoleMappedData GetTestData(Subset subset, Batch batch)
        {
            Host.CheckValueOrNull(subset);
            // Guard the batch itself before dereferencing it.
            Host.CheckValue(batch, nameof(batch));
            Host.CheckValue(batch.TestInstances, nameof(batch), "Batch does not have test data");

            if (subset == null || subset.SelectedFeatures == null)
                return batch.TestInstances;
            return EnsembleUtils.SelectFeatures(Host, batch.TestInstances, subset.SelectedFeatures);
        }
    }
}

// File: src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Collections.Generic;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Ensemble.Selector;
using Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector;

[assembly: LoadableClass(typeof(BootstrapSelector), typeof(BootstrapSelector.Arguments),
    typeof(SignatureEnsembleDataSelector), BootstrapSelector.UserName, BootstrapSelector.LoadName)]

namespace Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector
{
    /// <summary>
    /// Subset selector that draws a bootstrap resample (sampling with replacement)
    /// of the training data for each base learner.
    /// </summary>
    public sealed class BootstrapSelector : BaseSubsetSelector<BootstrapSelector.Arguments>
    {
        public const string UserName = "Bootstrap Selector";
        public const string LoadName = "BootstrapSelector";

        public class Arguments : ArgumentsBase
        {
            // REVIEW tfinley: This could be reintroduced by having the transform counting the
            // proportions of each label, then adjusting the lambdas accordingly. However, at
            // the current point in time supporting this non-default action is not considered
            // a priority.
#if OLD_ENSEMBLE
            [Argument(ArgumentType.AtMostOnce, HelpText = "If checked, the classes will be balanced by over sampling of minority classes", ShortName = "cb", SortOrder = 50)]
            public bool balanced = false;
#endif
        }

        public BootstrapSelector(IHostEnvironment env, Arguments args)
            : base(args, env, LoadName)
        {
        }

        public override IEnumerable<Subset> GetSubsets(Batch batch, IRandom rand)
        {
            for (int i = 0; i < Size; i++)
            {
                // REVIEW tfinley: Consider ways to reintroduce "balanced" samples.
                // NOTE(review): the bootstrap transform is created with default arguments,
                // so its sampling is not seeded from `rand` (unlike RandomPartitionSelector,
                // which seeds its transform) — confirm whether reproducibility matters here.
                var viewTrain = new BootstrapSampleTransform(Host, new BootstrapSampleTransform.Arguments(), Data.Data);
                var dataTrain = RoleMappedData.Create(viewTrain, Data.Schema.GetColumnRoleNames());
                yield return FeatureSelector.SelectFeatures(dataTrain, rand);
            }
        }
    }
}

// File: src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Ensemble.Selector;
using Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector;

[assembly: LoadableClass(typeof(RandomPartitionSelector), typeof(RandomPartitionSelector.Arguments),
    typeof(SignatureEnsembleDataSelector), RandomPartitionSelector.UserName, RandomPartitionSelector.LoadName)]

namespace Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector
{
    /// <summary>
    /// Subset selector that partitions the training data into Size disjoint random
    /// slices, one per base learner.
    /// </summary>
    public sealed class RandomPartitionSelector : BaseSubsetSelector<RandomPartitionSelector.Arguments>
    {
        public const string UserName = "Random Partition Selector";
        public const string LoadName = "RandomPartitionSelector";

        public sealed class Arguments : ArgumentsBase
        {
        }

        public RandomPartitionSelector(IHostEnvironment env, Arguments args)
            : base(args, env, LoadName)
        {
        }

        public override IEnumerable<Subset> GetSubsets(Batch batch, IRandom rand)
        {
            // Tag every row with a uniform random number, then carve the [0,1) range
            // into Size equal slices; each slice becomes one training subset.
            string name = Data.Data.Schema.GetTempColumnName();
            var args = new GenerateNumberTransform.Arguments();
            args.Column = new[] { new GenerateNumberTransform.Column() { Name = name } };
            args.Seed = (uint)rand.Next();
            IDataTransform view = new GenerateNumberTransform(Host, args, Data.Data);

            // REVIEW shonk: This won't be very efficient when _size is large.
            for (int i = 0; i < Size; i++)
            {
                var viewTrain = new RangeFilter(Host, new RangeFilter.Arguments() { Column = name, Min = (Double)i / Size, Max = (Double)(i + 1) / Size }, view);
                var dataTrain = RoleMappedData.Create(viewTrain, Data.Schema.GetColumnRoleNames());
                yield return FeatureSelector.SelectFeatures(dataTrain, rand);
            }
        }
    }
}

// File: src/Microsoft.ML.Ensemble/Subset.cs
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Collections;
using Microsoft.ML.Runtime.Data;

namespace Microsoft.ML.Runtime.Ensemble
{
    /// <summary>
    /// A training subset: the (possibly resampled) data plus an optional bit mask
    /// of the features selected for it (null means all features).
    /// </summary>
    public sealed class Subset
    {
        public readonly RoleMappedData Data;
        public readonly BitArray SelectedFeatures;

        public Subset(RoleMappedData data, BitArray features = null)
        {
            // Public entry point: validate rather than assert, so release builds
            // fail fast instead of surfacing a null reference later.
            Contracts.CheckValue(data, nameof(data));
            Data = data;
            SelectedFeatures = features;
        }
    }
}

// File: src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.CommandLine;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Ensemble;
using Microsoft.ML.Runtime.Learners;
using Microsoft.ML.Runtime.Ensemble.OutputCombiners;
using Microsoft.ML.Runtime.Ensemble.Selector;
using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector;

[assembly: LoadableClass(EnsembleTrainer.Summary, typeof(EnsembleTrainer), typeof(EnsembleTrainer.Arguments),
    new[] { typeof(SignatureBinaryClassifierTrainer), typeof(SignatureTrainer) },
    EnsembleTrainer.UserNameValue, EnsembleTrainer.LoadNameValue, "pe", "ParallelEnsemble")]

namespace Microsoft.ML.Runtime.Ensemble
{
    using TDistPredictor = IDistPredictorProducing<Single, Single>;
    using TScalarPredictor = IPredictorProducing<Single>;

    /// <summary>
    /// A generic ensemble trainer for binary classification.
    /// </summary>
    // NOTE(review): generic type arguments in this declaration were mangled by the
    // patch extraction; they are restored from context — verify against the sources.
    public sealed class EnsembleTrainer : EnsembleTrainerBase<Single, TScalarPredictor,
        IBinarySubModelSelector, IBinaryOutputCombiner, SignatureBinaryClassifierTrainer>,
        IModelCombiner<WeightedValue<TScalarPredictor>, TScalarPredictor>
    {
        public const string LoadNameValue = "WeightedEnsemble";
        public const string UserNameValue = "Parallel Ensemble (bagging, stacking, etc)";
        public const string Summary = "A generic ensemble classifier for binary classification.";

        public sealed class Arguments : ArgumentsBase
        {
            public Arguments()
            {
                // Defaults: linear SVM base learners, median score combination,
                // and no sub-model pruning.
                BasePredictors = new[] { new SubComponent<ITrainer<RoleMappedData, TScalarPredictor>, SignatureBinaryClassifierTrainer>("LinearSVM") };
                OutputCombiner = new SubComponent(Median.LoadName);
                SubModelSelectorType = new SubComponent(AllSelector.LoadName);
            }
        }

        public EnsembleTrainer(IHostEnvironment env, Arguments args)
            : base(args, env, LoadNameValue)
        {
        }

        public override PredictionKind PredictionKind
        {
            get { return PredictionKind.BinaryClassification; }
        }

        /// <summary>
        /// Builds the final ensemble: a distribution-capable predictor when every
        /// sub-model can produce calibrated outputs, otherwise a plain score ensemble.
        /// </summary>
        public override TScalarPredictor CreatePredictor()
        {
            bool allDistribution = Models.All(m => m.Predictor is TDistPredictor);
            if (allDistribution)
                return new EnsembleDistributionPredictor(Host, PredictionKind, CreateModels(), Combiner);
            return new EnsemblePredictor(Host, PredictionKind, CreateModels(), Combiner);
        }

        /// <summary>
        /// Combines pre-trained, weighted models into a single ensemble predictor.
        /// A uniform weighting (all ones) is represented as a null weight array.
        /// </summary>
        public TScalarPredictor CombineModels(IEnumerable<WeightedValue<TScalarPredictor>> models)
        {
            var modelList = models.ToList();

            var weights = modelList.Select(m => m.Weight).ToArray();
            if (weights.All(w => w == 1))
                weights = null;

            var combiner = Args.OutputCombiner.CreateInstance(Host);
            var first = modelList.First().Value;

            if (first is TDistPredictor)
            {
                return new EnsembleDistributionPredictor(Host, first.PredictionKind,
                    modelList.Select(k => new FeatureSubsetModel<TDistPredictor>((TDistPredictor)k.Value)).ToArray(),
                    combiner,
                    weights);
            }

            return new EnsemblePredictor(Host, first.PredictionKind,
                modelList.Select(k => new FeatureSubsetModel<TScalarPredictor>(k.Value)).ToArray(),
                combiner,
                weights);
        }
    }
}

// File: src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Ensemble;
using Microsoft.ML.Runtime.Ensemble.OutputCombiners;
using Microsoft.ML.Runtime.Internal.Utilities;
using Microsoft.ML.Runtime.Model;

// These are for deserialization from a model repository.
[assembly: LoadableClass(typeof(EnsembleDistributionPredictor), null, typeof(SignatureLoadModel),
    EnsembleDistributionPredictor.UserName,
    EnsembleDistributionPredictor.LoaderSignature)]

namespace Microsoft.ML.Runtime.Ensemble
{
    using TDistPredictor = IDistPredictorProducing<Single, Single>;

    /// <summary>
    /// Ensemble predictor whose sub-models each produce a (score, probability) pair.
    /// Scores are merged by the configured combiner; probabilities are always merged
    /// by median.
    /// </summary>
    public sealed class EnsembleDistributionPredictor :
        EnsemblePredictorBase<TDistPredictor, Single>,
        TDistPredictor,
        IValueMapperDist
    {
        public const string UserName = "Ensemble Distribution Executor";
        public const string LoaderSignature = "EnsemDbExec";
        public const string RegistrationName = "EnsembleDistributionPredictor";

        private static VersionInfo GetVersionInfo()
        {
            return new VersionInfo(
                modelSignature: "ENSEM DB",
                // verWrittenCur: 0x00010001, // Initial
                //verWrittenCur: 0x00010002, // Metrics and subset info into main stream, after each predictor
                verWrittenCur: 0x00010003, // Don't serialize the "IsAveraged" property of the metrics
                verReadableCur: 0x00010003,
                verWeCanReadBack: 0x00010002,
                loaderSignature: LoaderSignature);
        }

        // Per-model weights derived from the weighted-averager's metric when
        // applicable; otherwise identical to Weights.
        private readonly Single[] _averagedWeights;

        private readonly PredictionKind _kind;
        // Probabilities are always combined by median, independent of the score combiner.
        private readonly Median _probabilityCombiner;

        private readonly ColumnType _inputType;
        private readonly IValueMapperDist[] _mappers;

        public ColumnType InputType { get { return _inputType; } }
        public ColumnType OutputType { get { return NumberType.Float; } }
        public ColumnType DistType { get { return NumberType.Float; } }

        internal EnsembleDistributionPredictor(IHostEnvironment env, PredictionKind kind,
            FeatureSubsetModel<TDistPredictor>[] models, IOutputCombiner<Single> combiner, Single[] weights = null)
            : base(env, RegistrationName, models, combiner, weights)
        {
            _kind = kind;
            _probabilityCombiner = new Median(env);
            _inputType = InitializeMappers(out _mappers);
            ComputeAveragedWeights(out _averagedWeights);
        }

        private EnsembleDistributionPredictor(IHostEnvironment env, ModelLoadContext ctx)
            : base(env, RegistrationName, ctx)
        {
            _kind = (PredictionKind)ctx.Reader.ReadInt32();
            _probabilityCombiner = new Median(env);
            _inputType = InitializeMappers(out _mappers);
            ComputeAveragedWeights(out _averagedWeights);
        }

        /// <summary>
        /// Validates that every sub-model supports distribution mapping, and derives
        /// the common input type (sub-models must agree on vector size).
        /// </summary>
        private ColumnType InitializeMappers(out IValueMapperDist[] mappers)
        {
            Host.AssertNonEmpty(Models);

            mappers = new IValueMapperDist[Models.Length];
            ColumnType inputType = null;
            for (int i = 0; i < Models.Length; i++)
            {
                var vmd = Models[i].Predictor as IValueMapperDist;
                if (!IsValid(vmd))
                    throw Host.Except("Predictor does not implement expected interface");
                if (vmd.InputType.VectorSize > 0)
                {
                    if (inputType == null)
                        inputType = vmd.InputType;
                    else if (vmd.InputType.VectorSize != inputType.VectorSize)
                        throw Host.Except("Predictor input type mismatch");
                }
                mappers[i] = vmd;
            }
            // No sub-model declared a fixed size: accept variable-length float vectors.
            return inputType ?? new VectorType(NumberType.Float);
        }

        private bool IsValid(IValueMapperDist mapper)
        {
            return mapper != null
                && mapper.InputType.IsVector && mapper.InputType.ItemType == NumberType.Float
                && mapper.OutputType == NumberType.Float
                && mapper.DistType == NumberType.Float;
        }

        public static EnsembleDistributionPredictor Create(IHostEnvironment env, ModelLoadContext ctx)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel(GetVersionInfo());
            return new EnsembleDistributionPredictor(env, ctx);
        }

        protected override void SaveCore(ModelSaveContext ctx)
        {
            base.SaveCore(ctx);
            ctx.SetVersionInfo(GetVersionInfo());

            // *** Binary format ***
            // int: _kind
            ctx.Writer.Write((int)_kind);
        }

        public override PredictionKind PredictionKind { get { return _kind; } }

        public ValueMapper<TIn, TOut> GetMapper<TIn, TOut>()
        {
            Host.Check(typeof(TIn) == typeof(VBuffer<Single>));
            Host.Check(typeof(TOut) == typeof(Single));

            var combine = Combiner.GetCombiner();
            var maps = GetMaps();
            var predictions = new Single[_mappers.Length];
            var probabilities = new Single[_mappers.Length];
            var vBuffers = new VBuffer<Single>[_mappers.Length];
            ValueMapper<VBuffer<Single>, Single> del =
                (ref VBuffer<Single> src, ref Single dst) =>
                {
                    if (_inputType.VectorSize > 0)
                        Host.Check(src.Length == _inputType.VectorSize);

                    var tmp = src;
                    Parallel.For(0, maps.Length, i =>
                    {
                        var model = Models[i];
                        if (model.SelectedFeatures != null)
                        {
                            EnsembleUtils.SelectFeatures(ref tmp, model.SelectedFeatures, model.Cardinality, ref vBuffers[i]);
                            maps[i](ref vBuffers[i], ref predictions[i], ref probabilities[i]);
                        }
                        else
                            maps[i](ref tmp, ref predictions[i], ref probabilities[i]);
                    });

                    // Fix for Bug 3303: both mapper overloads now combine scores with the
                    // same metric-derived averaged weights (previously this overload
                    // used the raw Weights while the distribution overload did not).
                    combine(ref dst, predictions, _averagedWeights);
                };

            return (ValueMapper<TIn, TOut>)(Delegate)del;
        }

        public ValueMapper<TIn, TOut, TDist> GetMapper<TIn, TOut, TDist>()
        {
            Host.Check(typeof(TIn) == typeof(VBuffer<Single>));
            Host.Check(typeof(TOut) == typeof(Single));
            Host.Check(typeof(TDist) == typeof(Single));

            var combine = Combiner.GetCombiner();
            var combineProb = _probabilityCombiner.GetCombiner();
            var maps = GetMaps();
            var predictions = new Single[_mappers.Length];
            var probabilities = new Single[_mappers.Length];
            var vBuffers = new VBuffer<Single>[_mappers.Length];
            ValueMapper<VBuffer<Single>, Single, Single> del =
                (ref VBuffer<Single> src, ref Single score, ref Single prob) =>
                {
                    if (_inputType.VectorSize > 0)
                        Host.Check(src.Length == _inputType.VectorSize);

                    var tmp = src;
                    Parallel.For(0, maps.Length, i =>
                    {
                        var model = Models[i];
                        if (model.SelectedFeatures != null)
                        {
                            EnsembleUtils.SelectFeatures(ref tmp, model.SelectedFeatures, model.Cardinality, ref vBuffers[i]);
                            maps[i](ref vBuffers[i], ref predictions[i], ref probabilities[i]);
                        }
                        else
                            maps[i](ref tmp, ref predictions[i], ref probabilities[i]);
                    });

                    combine(ref score, predictions, _averagedWeights);
                    combineProb(ref prob, probabilities, _averagedWeights);
                };

            return (ValueMapper<TIn, TOut, TDist>)(Delegate)del;
        }

        private ValueMapper<VBuffer<Single>, Single, Single>[] GetMaps()
        {
            Host.AssertValue(_mappers);

            var maps = new ValueMapper<VBuffer<Single>, Single, Single>[_mappers.Length];
            for (int i = 0; i < _mappers.Length; i++)
                maps[i] = _mappers[i].GetMapper<VBuffer<Single>, Single, Single>();
            return maps;
        }

        /// <summary>
        /// When the combiner is a weighted averager and no explicit weights were given,
        /// derives per-model weights from the metric the averager names; otherwise the
        /// raw weights are used as-is.
        /// </summary>
        private void ComputeAveragedWeights(out Single[] averagedWeights)
        {
            averagedWeights = Weights;
            if (Combiner is IWeightedAverager weightedAverager && averagedWeights == null && Models[0].Metrics != null)
            {
                var metric = default(KeyValuePair<string, double>);
                bool found = false;
                foreach (var m in Models[0].Metrics)
                {
                    metric = m;
                    // Metric names are identifiers, not user text: compare the
                    // normalized names ordinally and case-insensitively rather than
                    // via culture-sensitive ToLower().
                    if (string.Equals(Utils.ExtractLettersAndNumbers(m.Key), weightedAverager.WeightageMetricName,
                        StringComparison.OrdinalIgnoreCase))
                    {
                        found = true;
                        break;
                    }
                }
                if (found)
                    averagedWeights = Models.SelectMany(model => model.Metrics).Where(m => m.Key == metric.Key).Select(m => (Single)m.Value).ToArray();
            }
        }
    }
}

// File: src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictor.cs
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Threading.Tasks;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Ensemble;
using Microsoft.ML.Runtime.Model;
using Microsoft.ML.Runtime.Ensemble.OutputCombiners;

[assembly: LoadableClass(typeof(EnsemblePredictor), null, typeof(SignatureLoadModel),
    EnsemblePredictor.UserName, EnsemblePredictor.LoaderSignature)]

namespace Microsoft.ML.Runtime.Ensemble
{
    using TScalarPredictor = IPredictorProducing<Single>;

    /// <summary>
    /// Ensemble predictor producing a single score per example by combining the
    /// scores of its sub-models with the configured combiner.
    /// </summary>
    public sealed class EnsemblePredictor :
        EnsemblePredictorBase<TScalarPredictor, Single>,
        IValueMapper
    {
        public const string UserName = "Ensemble Executor";
        public const string LoaderSignature = "EnsembleFloatExec";
        public const string RegistrationName = "EnsemblePredictor";

        private static VersionInfo GetVersionInfo()
        {
            return new VersionInfo(
                modelSignature: "ENSEM XX",
                // verWrittenCur: 0x00010001, // Initial
                //verWrittenCur: 0x00010002, // Metrics and subset info into main stream, after each predictor
                verWrittenCur: 0x00010003, // Don't serialize the "IsAveraged" property of the metrics
                verReadableCur: 0x00010003,
                verWeCanReadBack: 0x00010002,
                loaderSignature: LoaderSignature);
        }

        private readonly PredictionKind _kind;
        private readonly ColumnType _inputType;
        private readonly IValueMapper[] _mappers;

        public ColumnType InputType { get { return _inputType; } }
        public ColumnType OutputType { get { return NumberType.Float; } }
        public override PredictionKind PredictionKind { get { return _kind; } }

        internal EnsemblePredictor(IHostEnvironment env, PredictionKind kind,
            FeatureSubsetModel<TScalarPredictor>[] models, IOutputCombiner<Single> combiner, Single[] weights = null)
            // BUGFIX: register under RegistrationName in both constructors; previously
            // this one passed LoaderSignature while the deserialization constructor
            // passed RegistrationName (EnsembleDistributionPredictor uses
            // RegistrationName consistently).
            : base(env, RegistrationName, models, combiner, weights)
        {
            _kind = kind;
            _inputType = InitializeMappers(out _mappers);
        }

        private EnsemblePredictor(IHostEnvironment env, ModelLoadContext ctx)
            : base(env, RegistrationName, ctx)
        {
            _kind = (PredictionKind)ctx.Reader.ReadInt32();
            _inputType = InitializeMappers(out _mappers);
        }

        /// <summary>
        /// Validates that every sub-model is an IValueMapper over float vectors and
        /// derives the common input type (sub-models must agree on vector size).
        /// </summary>
        private ColumnType InitializeMappers(out IValueMapper[] mappers)
        {
            Host.AssertNonEmpty(Models);

            mappers = new IValueMapper[Models.Length];
            ColumnType inputType = null;
            for (int i = 0; i < Models.Length; i++)
            {
                var vm = Models[i].Predictor as IValueMapper;
                if (!IsValid(vm))
                    throw Host.Except("Predictor does not implement expected interface");
                if (vm.InputType.VectorSize > 0)
                {
                    if (inputType == null)
                        inputType = vm.InputType;
                    else if (vm.InputType.VectorSize != inputType.VectorSize)
                        throw Host.Except("Predictor input type mismatch");
                }
                mappers[i] = vm;
            }

            // No sub-model declared a fixed size: accept variable-length float vectors.
            return inputType ?? new VectorType(NumberType.Float);
        }

        private bool IsValid(IValueMapper mapper)
        {
            return mapper != null
                && mapper.InputType.IsVector && mapper.InputType.ItemType == NumberType.Float
                && mapper.OutputType == NumberType.Float;
        }

        public static EnsemblePredictor Create(IHostEnvironment env, ModelLoadContext ctx)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(ctx, nameof(ctx));
            ctx.CheckAtModel(GetVersionInfo());
            return new EnsemblePredictor(env, ctx);
        }

        protected override void SaveCore(ModelSaveContext ctx)
        {
            base.SaveCore(ctx);
            ctx.SetVersionInfo(GetVersionInfo());

            // *** Binary format ***
            // int: _kind
            ctx.Writer.Write((int)_kind);
        }

        public ValueMapper<TIn, TOut> GetMapper<TIn, TOut>()
        {
            Host.Check(typeof(TIn) == typeof(VBuffer<Single>));
            Host.Check(typeof(TOut) == typeof(Single));

            var combine = Combiner.GetCombiner();
            var predictions = new Single[_mappers.Length];
            var buffers = new VBuffer<Single>[_mappers.Length];
            var maps = new ValueMapper<VBuffer<Single>, Single>[_mappers.Length];
            for (int i = 0; i < _mappers.Length; i++)
                maps[i] = _mappers[i].GetMapper<VBuffer<Single>, Single>();

            ValueMapper<VBuffer<Single>, Single> del =
                (ref VBuffer<Single> src, ref Single dst) =>
                {
                    if (_inputType.VectorSize > 0)
                        Host.Check(src.Length == _inputType.VectorSize);

                    var tmp = src;
                    Parallel.For(0, maps.Length, i =>
                    {
                        var model = Models[i];
                        if (model.SelectedFeatures != null)
                        {
                            EnsembleUtils.SelectFeatures(ref tmp, model.SelectedFeatures, model.Cardinality, ref buffers[i]);
                            maps[i](ref buffers[i], ref predictions[i]);
                        }
                        else
                            maps[i](ref tmp, ref predictions[i]);
                    });

                    combine(ref dst, predictions, Weights);
                };

            return (ValueMapper<TIn, TOut>)(Delegate)del;
        }
    }
}

// File: src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictorBase.cs
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.IO;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Ensemble.OutputCombiners;
using Microsoft.ML.Runtime.Internal.Internallearn;
using Microsoft.ML.Runtime.Internal.Utilities;
using Microsoft.ML.Runtime.Model;

namespace Microsoft.ML.Runtime.Ensemble
{
    /// <summary>
    /// Common state and (de)serialization logic for ensemble predictors: the
    /// sub-models with their feature subsets and metrics, the output combiner,
    /// and optional per-model weights.
    /// </summary>
    public abstract class EnsemblePredictorBase<TPredictor, TOutput> : PredictorBase<TOutput>,
        IPredictorProducing<TOutput>,
        ICanSaveInTextFormat,
        ICanSaveModel,
        ICanSaveSummary
        where TPredictor : class, IPredictorProducing<TOutput>
    {
        private const string SubPredictorFmt = "SubPredictor_{0:000}";

        protected readonly FeatureSubsetModel<TPredictor>[] Models;
        protected readonly IOutputCombiner<TOutput> Combiner;
        protected readonly Single[] Weights;

        private const uint VerOld = 0x00010002;

        protected EnsemblePredictorBase(IHostEnvironment env, string name, FeatureSubsetModel<TPredictor>[] models,
            IOutputCombiner<TOutput> combiner, Single[] weights)
            : base(env, name)
        {

            Host.Check(Utils.Size(models) > 0, "Ensemble was created with no models.");
            Host.Check(weights == null || weights.Length == models.Length);

            Models = models;
Combiner = combiner; + Weights = weights; + } + + protected EnsemblePredictorBase(IHostEnvironment env, string name, ModelLoadContext ctx) + : base(env, name, ctx) + { + // *** Binary format *** + // int: model count + // int: weight count (0 or model count) + // Float[]: weights + // for each model: + // int: number of SelectedFeatures (in bits) + // byte[]: selected features (as many as needed for number of bits == (numSelectedFeatures + 7) / 8) + // int: number of Metric values + // for each Metric: + // Float: metric value + // int: metric name (id of the metric name in the string table) + // in version 0x0001x0002: + // bool: is the metric averaged + + int count = ctx.Reader.ReadInt32(); + Host.CheckDecode(count > 0); + + int weightCount = ctx.Reader.ReadInt32(); + Host.CheckDecode(weightCount == 0 || weightCount == count); + Weights = ctx.Reader.ReadFloatArray(weightCount); + + Models = new FeatureSubsetModel[count]; + var ver = ctx.Header.ModelVerWritten; + for (int i = 0; i < count; i++) + { + ctx.LoadModel(Host, out IPredictor p, string.Format(SubPredictorFmt, i)); + var predictor = p as TPredictor; + Host.Check(p != null, "Inner predictor type not compatible with the ensemble type."); + var features = ctx.Reader.ReadBitArray(); + int numMetrics = ctx.Reader.ReadInt32(); + Host.CheckDecode(numMetrics >= 0); + var metrics = new KeyValuePair[numMetrics]; + for (int j = 0; j < numMetrics; j++) + { + var metricValue = ctx.Reader.ReadFloat(); + var metricName = ctx.LoadStringOrNull(); + if (ver == VerOld) + ctx.Reader.ReadBoolByte(); + metrics[j] = new KeyValuePair(metricName, metricValue); + } + Models[i] = new FeatureSubsetModel(predictor, features, metrics); + } + ctx.LoadModel, SignatureLoadModel>(Host, out Combiner, @"Combiner"); + } + + protected override void SaveCore(ModelSaveContext ctx) + { + base.SaveCore(ctx); + + // *** Binary format *** + // int: model count + // int: weight count (0 or model count) + // Single[]: weights + // for each model: + // 
int: number of SelectedFeatures (in bits) + // byte[]: selected features (as many as needed for number of bits == (numSelectedFeatures + 7) / 8) + // int: number of Metric values + // for each Metric: + // Single: metric value + // int: metric name (id of the metric name in the string table) + + ctx.Writer.Write(Models.Length); + ctx.Writer.WriteFloatArray(Weights); + + // Save other streams. + for (int i = 0; i < Models.Length; i++) + { + var model = Models[i]; + ctx.SaveModel(model.Predictor, string.Format(SubPredictorFmt, i)); + Host.AssertValueOrNull(model.SelectedFeatures); + ctx.Writer.WriteBitArray(model.SelectedFeatures); + Host.AssertValueOrNull(model.Metrics); + int numMetrics = Utils.Size(model.Metrics); + ctx.Writer.Write(numMetrics); + for (int j = 0; j < numMetrics; j++) + { + var metric = model.Metrics[j]; + ctx.Writer.Write((Single)metric.Value); + ctx.SaveStringOrNull(metric.Key); + } + } + ctx.SaveModel(Combiner, @"Combiner"); + } + + /// + /// Output the INI model to a given writer + /// + public void SaveAsText(TextWriter writer, RoleMappedSchema schema) + { + using (var ch = Host.Start("SaveAsText")) + { + for (int i = 0; i < Models.Length; i++) + { + writer.WriteLine(";; Partition model {0}", i); + writer.WriteLine(";; Weight={0}", (Weights != null ? Weights[i] : 1)); + PredictorUtils.SaveText(ch, Models[i].Predictor, schema, writer); + } + ch.Done(); + } + } + + /// + /// Saves the model summary + /// + public void SaveSummary(TextWriter writer, RoleMappedSchema schema) + { + for (int i = 0; i < Models.Length; i++) + { + writer.WriteLine(";; Partition model {0}", i); + writer.WriteLine(";; Weight={0}", (Weights != null ? Weights[i] : 1)); + + // REVIEW ansarim: The featureName Collection names may vary for different base learners. + // How do we get the right collection for the base learners? 
+ if (Models[i].Predictor is ICanSaveSummary summaryModel) + summaryModel.SaveSummary(writer, schema); + else + writer.WriteLine("The Model {0} does not support saving summaries", Models[i].GetType().Name); + } + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs b/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs new file mode 100644 index 0000000000..724b57c45f --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs @@ -0,0 +1,253 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.Ensemble.Selector; +using Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector; +using Microsoft.ML.Runtime.Internal.Internallearn; +using Microsoft.ML.Runtime.Internal.Utilities; +using Microsoft.ML.Runtime.Training; + +namespace Microsoft.ML.Runtime.Ensemble +{ + using Stopwatch = System.Diagnostics.Stopwatch; + public abstract class EnsembleTrainerBase : TrainerBase + where TPredictor : class, IPredictorProducing + where TSelector : class, ISubModelSelector + where TCombiner : class, IOutputCombiner + { + public abstract class ArgumentsBase + { + [Argument(ArgumentType.AtMostOnce, + HelpText = "Number of models per batch. If not specified, will default to 50 if there is only one base predictor, " + + "or the number of base predictors otherwise.", ShortName = "nm", SortOrder = 3)] + [TGUI(Label = "Number of Models per batch")] + public int? 
NumModels; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Batch size", ShortName = "bs", SortOrder = 107)] + [TGUI(Label = "Batch Size", + Description = + "Number of instances to be loaded in memory to create an ensemble out of it. All the instances will be loaded if the value is -1.")] + public int BatchSize = -1; + + [Argument(ArgumentType.Multiple, HelpText = "Sampling Type", ShortName = "st", SortOrder = 2)] + [TGUI(Label = "Sampling Type", Description = "Subset Selection Algorithm to induce the base learner.Sub-settings can be used to select the features")] + public SubComponent SamplingType + = new SubComponent(BootstrapSelector.LoadName); + + [Argument(ArgumentType.AtMostOnce, HelpText = "All the base learners will run asynchronously if the value is true", ShortName = "tp", SortOrder = 106)] + [TGUI(Label = "Train parallel", Description = "All the base learners will run asynchronously if the value is true")] + public bool TrainParallel; + + [Argument(ArgumentType.AtMostOnce, + HelpText = "True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set", + ShortName = "sm", SortOrder = 108)] + [TGUI(Label = "Show Sub-Model Metrics")] + public bool ShowMetrics; + + [Argument(ArgumentType.Multiple, HelpText = "Output combiner", ShortName = "oc", SortOrder = 5)] + [TGUI(Label = "Output combiner", Description = "Output combiner type")] + public SubComponent OutputCombiner; + + [Argument(ArgumentType.Multiple, HelpText = "Algorithm to prune the base learners for selective Ensemble", ShortName = "pt", SortOrder = 4)] + [TGUI(Label = "Sub-Model Selector(pruning) Type", + Description = "Algorithm to prune the base learners for selective Ensemble")] + public SubComponent SubModelSelectorType; + + [Argument(ArgumentType.Multiple, HelpText = "Base predictor type", ShortName = "bp,basePredictorTypes", SortOrder = 1)] + public SubComponent>, TSig>[] BasePredictors; + + public const int DefaultNumModels = 50; + } + + /// Command-line arguments + protected readonly ArgumentsBase Args; + protected readonly int NumModels; + + /// Ensemble members + protected readonly ITrainer>[] Trainers; + + private readonly ISubsetSelector _subsetSelector; + private readonly TSelector _subModelSelector; + + protected readonly TCombiner Combiner; + + protected List>> Models; + + private readonly bool _needNorm; + private readonly bool _needCalibration; + + internal EnsembleTrainerBase(ArgumentsBase args, IHostEnvironment env, string name) + : base(env, name) + { + Args = args; + + using (var ch = Host.Start("Init")) + { + ch.CheckUserArg(Utils.Size(Args.BasePredictors) > 0, nameof(Args.BasePredictors), "This should have at-least one value"); + + NumModels = Args.NumModels ?? + (Args.BasePredictors.Length == 1 ? 
ArgumentsBase.DefaultNumModels : Args.BasePredictors.Length); + + ch.CheckUserArg(NumModels > 0, nameof(Args.NumModels), "Must be positive, or null to indicate numModels is the number of base predictors"); + + if (Utils.Size(Args.BasePredictors) > NumModels) + ch.Warning("The base predictor count is greater than models count. Some of the base predictors will be ignored."); + + _subsetSelector = Args.SamplingType.CreateInstance(Host); + _subModelSelector = Args.SubModelSelectorType.CreateInstance(Host); + Combiner = Args.OutputCombiner.CreateInstance(Host); + + Trainers = new ITrainer>[NumModels]; + for (int i = 0; i < Trainers.Length; i++) + Trainers[i] = Args.BasePredictors[i % Args.BasePredictors.Length].CreateInstance(Host); + _needNorm = Trainers.Any( + t => + { + return t is ITrainerEx nn && nn.NeedNormalization; + }); + _needCalibration = Trainers.Any( + t => + { + return t is ITrainerEx nn && nn.NeedCalibration; + }); + ch.Done(); + } + } + + public override bool NeedNormalization { get { return _needNorm; } } + + public override bool NeedCalibration { get { return _needCalibration; } } + + // No matter the internal predictors, we are performing multiple passes over the data + // so it is probably appropriate to always cache. + public override bool WantCaching { get { return true; } } + + public override void Train(RoleMappedData data) + { + using (var ch = Host.Start("Training")) + { + TrainCore(ch, data); + ch.Done(); + } + } + + private void TrainCore(IChannel ch, RoleMappedData data) + { + Host.AssertValue(ch); + ch.AssertValue(data); + + // 1. Subset Selection + var stackingTrainer = Combiner as IStackingTrainer; + + //REVIEW ansarim: Implement stacking for Batch mode. 
+ ch.CheckUserArg(stackingTrainer == null || Args.BatchSize <= 0, nameof(Args.BatchSize), "Stacking works only with Non-batch mode"); + + var validationDataSetProportion = _subModelSelector.ValidationDatasetProportion; + if (stackingTrainer != null) + validationDataSetProportion = Math.Max(validationDataSetProportion, stackingTrainer.ValidationDatasetProportion); + + var needMetrics = Args.ShowMetrics || Combiner is IWeightedAverager; + + _subsetSelector.Initialize(data, NumModels, Args.BatchSize, validationDataSetProportion); + int batchNumber = 1; + foreach (var batch in _subsetSelector.GetBatches(Host.Rand)) + { + // 2. Core train + ch.Info("Training {0} learners for the batch {1}", Trainers.Length, batchNumber++); + var models = new FeatureSubsetModel>[Trainers.Length]; + + Parallel.ForEach(_subsetSelector.GetSubsets(batch, Host.Rand), + new ParallelOptions() { MaxDegreeOfParallelism = Args.TrainParallel ? -1 : 1 }, + (subset, state, index) => + { + ch.Info("Beginning training model {0} of {1}", index + 1, Trainers.Length); + Stopwatch sw = Stopwatch.StartNew(); + try + { + if (EnsureMinimumFeaturesSelected(subset)) + { + Trainers[(int)index].Train(subset.Data); + + var model = new FeatureSubsetModel>( + Trainers[(int)index].CreatePredictor(), + subset.SelectedFeatures, + null); + _subModelSelector.CalculateMetrics(model, _subsetSelector, subset, batch, needMetrics); + models[(int)index] = model; + } + } + catch (Exception ex) + { + ch.Assert(models[(int)index] == null); + ch.Warning(ex.Sensitivity(), "Trainer {0} of {1} was not learned properly due to the exception '{2}' and will not be added to models.", + index + 1, Trainers.Length, ex.Message); + } + ch.Info("Trainer {0} of {1} finished in {2}", index + 1, Trainers.Length, sw.Elapsed); + }); + + var modelsList = models.Where(m => m != null).ToList(); + if (Args.ShowMetrics) + PrintMetrics(ch, modelsList); + + modelsList = _subModelSelector.Prune(modelsList).ToList(); + + if (stackingTrainer != null) + 
stackingTrainer.Train(modelsList, _subsetSelector.GetTestData(null, batch), Host); + + foreach (var model in modelsList) + Utils.Add(ref Models, model); + int modelSize = Utils.Size(Models); + if (modelSize < Utils.Size(Trainers)) + ch.Warning("{0} of {1} trainings failed.", Utils.Size(Trainers) - modelSize, Utils.Size(Trainers)); + ch.Check(modelSize > 0, "Ensemble training resulted in no valid models."); + } + } + + private bool EnsureMinimumFeaturesSelected(Subset subset) + { + if (subset.SelectedFeatures == null) + return true; + for (int i = 0; i < subset.SelectedFeatures.Count; i++) + { + if (subset.SelectedFeatures[i]) + return true; + } + + return false; + } + + protected virtual void PrintMetrics(IChannel ch, List>> models) + { + // REVIEW tfinley: The formatting of this method is bizarre and seemingly not even self-consistent + // w.r.t. its usage of |. Is this intentional? + if (models.Count == 0 || models[0].Metrics == null) + return; + + ch.Info("{0}| Name of Model |", string.Join("", models[0].Metrics.Select(m => string.Format("| {0} |", m.Key)))); + + foreach (var model in models) + ch.Info("{0}{1}", string.Join("", model.Metrics.Select(m => string.Format("| {0} |", m.Value))), model.Predictor.GetType().Name); + } + + protected FeatureSubsetModel[] CreateModels() where T : IPredictor + { + var models = new FeatureSubsetModel[Models.Count]; + for (int i = 0; i < Models.Count; i++) + { + models[i] = new FeatureSubsetModel( + (T)Models[i].Predictor, + Models[i].SelectedFeatures, + Models[i].Metrics); + } + return models; + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Trainer/Multiclass/EnsembleMultiClassPredictor.cs b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/EnsembleMultiClassPredictor.cs new file mode 100644 index 0000000000..3c5561f585 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/EnsembleMultiClassPredictor.cs @@ -0,0 +1,152 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Threading.Tasks; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.Model; + +namespace Microsoft.ML.Runtime.Ensemble +{ + using TVectorPredictor = IPredictorProducing>; + public sealed class EnsembleMultiClassPredictor : + EnsemblePredictorBase>, + IValueMapper + { + public const string LoaderSignature = "EnsemMcExec"; + public const string RegistrationName = "EnsembleMultiClassPredictor"; + + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "ENSEM MC", + // verWrittenCur: 0x00010001, // Initial + //verWrittenCur: 0x00010002, // Metrics and subset info into main stream, after each predictor + verWrittenCur: 0x00010003, // Don't serialize the "IsAveraged" property of the metrics + verReadableCur: 0x00010003, + verWeCanReadBack: 0x00010002, + loaderSignature: LoaderSignature); + } + + private readonly ColumnType _inputType; + private readonly ColumnType _outputType; + private readonly IValueMapper[] _mappers; + + public ColumnType InputType { get { return _inputType; } } + public ColumnType OutputType { get { return _outputType; } } + + internal EnsembleMultiClassPredictor(IHostEnvironment env, FeatureSubsetModel[] models, + IOutputCombiner> combiner, Single[] weights = null) + : base(env, RegistrationName, models, combiner, weights) + { + InitializeMappers(out _mappers, out _inputType, out _outputType); + } + + private EnsembleMultiClassPredictor(IHostEnvironment env, ModelLoadContext ctx) + : base(env, RegistrationName, ctx) + { + InitializeMappers(out _mappers, out _inputType, out _outputType); + } + + private void InitializeMappers(out IValueMapper[] mappers, out ColumnType inputType, out ColumnType outputType) + { + Host.AssertNonEmpty(Models); + + mappers = new 
IValueMapper[Models.Length]; + inputType = null; + outputType = null; + for (int i = 0; i < Models.Length; i++) + { + var vm = Models[i].Predictor as IValueMapper; + if (!IsValid(vm)) + throw Host.Except("Predictor does not implement expected interface"); + if (vm.InputType.VectorSize > 0) + { + if (inputType == null) + inputType = vm.InputType; + else if (vm.InputType.VectorSize != inputType.VectorSize) + throw Host.Except("Predictor input type mismatch"); + } + + if (outputType == null || vm.OutputType.VectorSize > outputType.VectorSize) + outputType = vm.OutputType; + + mappers[i] = vm; + } + Host.AssertValue(outputType); + + if (inputType == null) + inputType = new VectorType(NumberType.Float); + } + + public static EnsembleMultiClassPredictor Create(IHostEnvironment env, ModelLoadContext ctx) + { + Contracts.CheckValue(env, nameof(env)); + env.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(GetVersionInfo()); + return new EnsembleMultiClassPredictor(env, ctx); + } + + protected override void SaveCore(ModelSaveContext ctx) + { + base.SaveCore(ctx); + ctx.SetVersionInfo(GetVersionInfo()); + } + + public override PredictionKind PredictionKind { get { return PredictionKind.MultiClassClassification; } } + + public ValueMapper GetMapper() + { + Host.Check(typeof(TIn) == typeof(VBuffer)); + Host.Check(typeof(TOut) == typeof(VBuffer)); + + var combine = Combiner.GetCombiner(); + var features = new VBuffer[_mappers.Length]; + var predictions = new VBuffer[_mappers.Length]; + var maps = new ValueMapper, VBuffer>[_mappers.Length]; + for (int i = 0; i < _mappers.Length; i++) + { + // IsValid method ensures we go this else path only if the OutputType.VectorSize of + // all _mappers is greater than zero + Host.Assert(_mappers[i].OutputType.VectorSize > 0); + maps[i] = _mappers[i].GetMapper, VBuffer>(); + } + + ValueMapper, VBuffer> del = + (ref VBuffer src, ref VBuffer dst) => + { + if (_inputType.VectorSize > 0) + Host.Check(src.Length == _inputType.VectorSize); + + var 
tmp = src; + Parallel.For(0, maps.Length, i => + { + var model = Models[i]; + if (model.SelectedFeatures != null) + { + EnsembleUtils.SelectFeatures(ref tmp, model.SelectedFeatures, model.Cardinality, ref features[i]); + maps[i](ref features[i], ref predictions[i]); + } + else + maps[i](ref tmp, ref predictions[i]); + + // individual maps delegates will return always the same VBuffer length + Host.Check(predictions[i].Length == _mappers[i].OutputType.VectorSize); + }); + + combine(ref dst, predictions, Weights); + }; + + return (ValueMapper)(Delegate)del; + } + + private bool IsValid(IValueMapper mapper) + { + return mapper != null + && mapper.InputType.IsVector && mapper.InputType.ItemType == NumberType.Float + && mapper.OutputType.VectorSize > 0 && mapper.OutputType.ItemType == NumberType.Float; + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs new file mode 100644 index 0000000000..f53412c922 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs @@ -0,0 +1,74 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble; +using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.Ensemble.Selector; +using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; +using Microsoft.ML.Runtime.Learners; + +[assembly: LoadableClass(MulticlassDataPartitionEnsembleTrainer.Summary, typeof(MulticlassDataPartitionEnsembleTrainer), + typeof(MulticlassDataPartitionEnsembleTrainer.Arguments), + new[] { typeof(SignatureMultiClassClassifierTrainer), typeof(SignatureTrainer) }, + MulticlassDataPartitionEnsembleTrainer.UserNameValue, + MulticlassDataPartitionEnsembleTrainer.LoadNameValue)] + +namespace Microsoft.ML.Runtime.Ensemble +{ + using TVectorPredictor = IPredictorProducing>; + /// + /// A generic ensemble classifier for multi-class classification + /// + public sealed class MulticlassDataPartitionEnsembleTrainer : + EnsembleTrainerBase, EnsembleMultiClassPredictor, + ISubModelSelector>, IOutputCombiner>, SignatureMultiClassClassifierTrainer>, + IModelCombiner, TVectorPredictor> + { + public const string LoadNameValue = "WeightedEnsembleMulticlass"; + public const string UserNameValue = "Multi-class Parallel Ensemble (bagging, stacking, etc)"; + public const string Summary = "A generic ensemble classifier for multi-class classification."; + + public sealed class Arguments : ArgumentsBase + { + public Arguments() + { + BasePredictors = new[] { new SubComponent, SignatureMultiClassClassifierTrainer>("MultiClassLogisticRegression") }; + OutputCombiner = new SubComponent>, SignatureCombiner>(MultiMedian.LoadName); + SubModelSelectorType = new SubComponent>, SignatureEnsembleSubModelSelector>(AllSelectorMultiClass.LoadName); + } + } + + public MulticlassDataPartitionEnsembleTrainer(IHostEnvironment env, Arguments args) + : base(args, env, LoadNameValue) + { + } + + 
public override PredictionKind PredictionKind { get { return PredictionKind.MultiClassClassification; } } + + public override EnsembleMultiClassPredictor CreatePredictor() + { + var combiner = Combiner; + return new EnsembleMultiClassPredictor(Host, CreateModels(), combiner); + } + + public TVectorPredictor CombineModels(IEnumerable> models) + { + var weights = models.Select(m => m.Weight).ToArray(); + if (weights.All(w => w == 1)) + weights = null; + + var predictor = new EnsembleMultiClassPredictor(Host, + models.Select(k => new FeatureSubsetModel(k.Value)).ToArray(), + Args.OutputCombiner.CreateInstance(Host), weights); + + return predictor; + } + } +} diff --git a/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs new file mode 100644 index 0000000000..7956f835a4 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs @@ -0,0 +1,73 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble; +using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.Ensemble.Selector; +using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; +using Microsoft.ML.Runtime.Learners; + +[assembly: LoadableClass(typeof(RegressionEnsembleTrainer), typeof(RegressionEnsembleTrainer.Arguments), + new[] { typeof(SignatureRegressorTrainer), typeof(SignatureTrainer) }, + RegressionEnsembleTrainer.UserNameValue, + RegressionEnsembleTrainer.LoadNameValue)] + +namespace Microsoft.ML.Runtime.Ensemble +{ + using TScalarPredictor = IPredictorProducing; + public sealed class RegressionEnsembleTrainer : EnsembleTrainerBase, + IModelCombiner, TScalarPredictor> + { + public const string LoadNameValue = "EnsembleRegression"; + public const string UserNameValue = "Regression Ensemble (bagging, stacking, etc)"; + + public sealed class Arguments : ArgumentsBase + { + public Arguments() + { + BasePredictors = new[] { new SubComponent, SignatureRegressorTrainer>("OnlineGradientDescent") }; + OutputCombiner = new SubComponent(Median.LoadName); + SubModelSelectorType = new SubComponent(AllSelector.LoadName); + } + } + + public RegressionEnsembleTrainer(IHostEnvironment env, Arguments args) + : base(args, env, LoadNameValue) + { + } + + public override PredictionKind PredictionKind + { + get { return PredictionKind.Regression; } + } + + public override TScalarPredictor CreatePredictor() + { + return new EnsemblePredictor(Host, PredictionKind, CreateModels(), Combiner); + } + + public TScalarPredictor CombineModels(IEnumerable> models) + { + var weights = models.Select(m => m.Weight).ToArray(); + if (weights.All(w => w == 1)) + weights = null; + var combiner = Args.OutputCombiner.CreateInstance(Host); + var p = models.First().Value; + + var predictor = new 
EnsemblePredictor(Host, p.PredictionKind, + models.Select(k => new FeatureSubsetModel(k.Value)).ToArray(), + combiner, + weights); + + return predictor; + } + } +} diff --git a/src/Microsoft.ML.Ensemble/WeightedValue.cs b/src/Microsoft.ML.Ensemble/WeightedValue.cs new file mode 100644 index 0000000000..9c655ff4db --- /dev/null +++ b/src/Microsoft.ML.Ensemble/WeightedValue.cs @@ -0,0 +1,14 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; + +namespace Microsoft.ML.Runtime.Ensemble +{ + public struct WeightedValue + { + public T Value; + public Single Weight; + } +} diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index e943e76bca..58ea963f37 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -106,6 +106,18 @@ public void Add(Microsoft.ML.Models.AnomalyDetectionEvaluator input, Microsoft.M _jsonNodes.Add(Serialize("Models.AnomalyDetectionEvaluator", input, output)); } + public Microsoft.ML.Models.AnomalyPipelineEnsemble.Output Add(Microsoft.ML.Models.AnomalyPipelineEnsemble input) + { + var output = new Microsoft.ML.Models.AnomalyPipelineEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Models.AnomalyPipelineEnsemble input, Microsoft.ML.Models.AnomalyPipelineEnsemble.Output output) + { + _jsonNodes.Add(Serialize("Models.AnomalyPipelineEnsemble", input, output)); + } + public Microsoft.ML.Models.BinaryClassificationEvaluator.Output Add(Microsoft.ML.Models.BinaryClassificationEvaluator input) { var output = new Microsoft.ML.Models.BinaryClassificationEvaluator.Output(); @@ -130,6 +142,30 @@ public void Add(Microsoft.ML.Models.BinaryCrossValidator input, Microsoft.ML.Mod _jsonNodes.Add(Serialize("Models.BinaryCrossValidator", input, output)); } + public Microsoft.ML.Models.BinaryEnsemble.Output 
Add(Microsoft.ML.Models.BinaryEnsemble input) + { + var output = new Microsoft.ML.Models.BinaryEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Models.BinaryEnsemble input, Microsoft.ML.Models.BinaryEnsemble.Output output) + { + _jsonNodes.Add(Serialize("Models.BinaryEnsemble", input, output)); + } + + public Microsoft.ML.Models.BinaryPipelineEnsemble.Output Add(Microsoft.ML.Models.BinaryPipelineEnsemble input) + { + var output = new Microsoft.ML.Models.BinaryPipelineEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Models.BinaryPipelineEnsemble input, Microsoft.ML.Models.BinaryPipelineEnsemble.Output output) + { + _jsonNodes.Add(Serialize("Models.BinaryPipelineEnsemble", input, output)); + } + public Microsoft.ML.Models.ClassificationEvaluator.Output Add(Microsoft.ML.Models.ClassificationEvaluator input) { var output = new Microsoft.ML.Models.ClassificationEvaluator.Output(); @@ -202,6 +238,18 @@ public void Add(Microsoft.ML.Models.DatasetTransformer input, Microsoft.ML.Model _jsonNodes.Add(Serialize("Models.DatasetTransformer", input, output)); } + public Microsoft.ML.Models.EnsembleSummary.Output Add(Microsoft.ML.Models.EnsembleSummary input) + { + var output = new Microsoft.ML.Models.EnsembleSummary.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Models.EnsembleSummary input, Microsoft.ML.Models.EnsembleSummary.Output output) + { + _jsonNodes.Add(Serialize("Models.EnsembleSummary", input, output)); + } + public Microsoft.ML.Models.FixedPlattCalibrator.Output Add(Microsoft.ML.Models.FixedPlattCalibrator input) { var output = new Microsoft.ML.Models.FixedPlattCalibrator.Output(); @@ -214,6 +262,18 @@ public void Add(Microsoft.ML.Models.FixedPlattCalibrator input, Microsoft.ML.Mod _jsonNodes.Add(Serialize("Models.FixedPlattCalibrator", input, output)); } + public Microsoft.ML.Models.MultiClassPipelineEnsemble.Output 
Add(Microsoft.ML.Models.MultiClassPipelineEnsemble input) + { + var output = new Microsoft.ML.Models.MultiClassPipelineEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Models.MultiClassPipelineEnsemble input, Microsoft.ML.Models.MultiClassPipelineEnsemble.Output output) + { + _jsonNodes.Add(Serialize("Models.MultiClassPipelineEnsemble", input, output)); + } + public Microsoft.ML.Models.MultiOutputRegressionEvaluator.Output Add(Microsoft.ML.Models.MultiOutputRegressionEvaluator input) { var output = new Microsoft.ML.Models.MultiOutputRegressionEvaluator.Output(); @@ -334,6 +394,18 @@ public void Add(Microsoft.ML.Models.RankerEvaluator input, Microsoft.ML.Models.R _jsonNodes.Add(Serialize("Models.RankerEvaluator", input, output)); } + public Microsoft.ML.Models.RegressionEnsemble.Output Add(Microsoft.ML.Models.RegressionEnsemble input) + { + var output = new Microsoft.ML.Models.RegressionEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Models.RegressionEnsemble input, Microsoft.ML.Models.RegressionEnsemble.Output output) + { + _jsonNodes.Add(Serialize("Models.RegressionEnsemble", input, output)); + } + public Microsoft.ML.Models.RegressionEvaluator.Output Add(Microsoft.ML.Models.RegressionEvaluator input) { var output = new Microsoft.ML.Models.RegressionEvaluator.Output(); @@ -346,6 +418,18 @@ public void Add(Microsoft.ML.Models.RegressionEvaluator input, Microsoft.ML.Mode _jsonNodes.Add(Serialize("Models.RegressionEvaluator", input, output)); } + public Microsoft.ML.Models.RegressionPipelineEnsemble.Output Add(Microsoft.ML.Models.RegressionPipelineEnsemble input) + { + var output = new Microsoft.ML.Models.RegressionPipelineEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Models.RegressionPipelineEnsemble input, Microsoft.ML.Models.RegressionPipelineEnsemble.Output output) + { + 
_jsonNodes.Add(Serialize("Models.RegressionPipelineEnsemble", input, output)); + } + public Microsoft.ML.Models.Summarizer.Output Add(Microsoft.ML.Models.Summarizer input) { var output = new Microsoft.ML.Models.Summarizer.Output(); @@ -1796,6 +1880,44 @@ public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IEva } } + namespace Models + { + public enum EnsembleCreatorScoreCombiner + { + Median = 0, + Average = 1 + } + + + /// + /// Combine anomaly detection models into an ensemble + /// + public sealed partial class AnomalyPipelineEnsemble + { + + + /// + /// The combiner used to combine the scores + /// + public EnsembleCreatorScoreCombiner ModelCombiner { get; set; } = EnsembleCreatorScoreCombiner.Average; + + /// + /// The models to combine into an ensemble + /// + public ArrayVar Models { get; set; } = new ArrayVar(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IAnomalyDetectionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + } + } + namespace Models { @@ -1983,6 +2105,82 @@ public sealed class Output } } + namespace Models + { + public enum EnsembleCreatorClassifierCombiner + { + Median = 0, + Average = 1, + Vote = 2 + } + + + /// + /// Combine binary classifiers into an ensemble + /// + public sealed partial class BinaryEnsemble + { + + + /// + /// The combiner used to combine the scores + /// + public EnsembleCreatorClassifierCombiner ModelCombiner { get; set; } = EnsembleCreatorClassifierCombiner.Median; + + /// + /// The models to combine into an ensemble + /// + public ArrayVar Models { get; set; } = new ArrayVar(); + + /// + /// Whether to validate that all the pipelines are identical + /// + public bool ValidatePipelines { get; set; } = true; + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, 
Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + } + } + + namespace Models + { + + /// + /// Combine binary classification models into an ensemble + /// + public sealed partial class BinaryPipelineEnsemble + { + + + /// + /// The combiner used to combine the scores + /// + public EnsembleCreatorClassifierCombiner ModelCombiner { get; set; } = EnsembleCreatorClassifierCombiner.Median; + + /// + /// The models to combine into an ensemble + /// + public ArrayVar Models { get; set; } = new ArrayVar(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + } + } + namespace Models { @@ -2461,6 +2659,38 @@ public DatasetTransformerPipelineStep(Output output) } } + namespace Models + { + + /// + /// Summarize a pipeline ensemble predictor. 
+ /// + public sealed partial class EnsembleSummary + { + + + /// + /// The predictor to summarize + /// + public Var PredictorModel { get; set; } = new Var(); + + + public sealed class Output + { + /// + /// The summaries of the individual predictors + /// + public ArrayVar Summaries { get; set; } = new ArrayVar(); + + /// + /// The model statistics of the individual predictors + /// + public ArrayVar Stats { get; set; } = new ArrayVar(); + + } + } + } + namespace Models { @@ -2535,6 +2765,38 @@ public FixedPlattCalibratorPipelineStep(Output output) } } + namespace Models + { + + /// + /// Combine multiclass classifiers into an ensemble + /// + public sealed partial class MultiClassPipelineEnsemble + { + + + /// + /// The combiner used to combine the scores + /// + public EnsembleCreatorClassifierCombiner ModelCombiner { get; set; } = EnsembleCreatorClassifierCombiner.Median; + + /// + /// The models to combine into an ensemble + /// + public ArrayVar Models { get; set; } = new ArrayVar(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IMulticlassClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + } + } + namespace Models { @@ -3271,6 +3533,43 @@ public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IEva } } + namespace Models + { + + /// + /// Combine regression models into an ensemble + /// + public sealed partial class RegressionEnsemble + { + + + /// + /// The combiner used to combine the scores + /// + public EnsembleCreatorScoreCombiner ModelCombiner { get; set; } = EnsembleCreatorScoreCombiner.Median; + + /// + /// The models to combine into an ensemble + /// + public ArrayVar Models { get; set; } = new ArrayVar(); + + /// + /// Whether to validate that all the pipelines are identical + /// + public bool ValidatePipelines { get; set; } = true; + + + public sealed class 
Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + } + } + namespace Models { @@ -3339,6 +3638,38 @@ public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IEva } } + namespace Models + { + + /// + /// Combine regression models into an ensemble + /// + public sealed partial class RegressionPipelineEnsemble + { + + + /// + /// The combiner used to combine the scores + /// + public EnsembleCreatorScoreCombiner ModelCombiner { get; set; } = EnsembleCreatorScoreCombiner.Median; + + /// + /// The models to combine into an ensemble + /// + public ArrayVar Models { get; set; } = new ArrayVar(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + } + } + namespace Models { diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index a6d1f50668..16925eec02 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -5,15 +5,20 @@ Data.PredictorModelArrayConverter Create an array variable of IPredictorModel Mi Data.TextLoader Import a dataset from a text file Microsoft.ML.Runtime.EntryPoints.ImportTextData TextLoader Microsoft.ML.Runtime.EntryPoints.ImportTextData+LoaderInput Microsoft.ML.Runtime.EntryPoints.ImportTextData+Output Data.TransformModelArrayConverter Create an array variable of ITransformModel Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro MakeArray Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro+ArrayITransformModelInput 
Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro+ArrayITransformModelOutput Models.AnomalyDetectionEvaluator Evaluates an anomaly detection scored dataset. Microsoft.ML.Runtime.Data.Evaluate AnomalyDetection Microsoft.ML.Runtime.Data.AnomalyDetectionMamlEvaluator+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+CommonEvaluateOutput +Models.AnomalyPipelineEnsemble Combine anomaly detection models into an ensemble Microsoft.ML.Runtime.EntryPoints.EnsembleCreator CreateAnomalyPipelineEnsemble Microsoft.ML.Runtime.EntryPoints.EnsembleCreator+PipelineAnomalyInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+AnomalyDetectionOutput Models.BinaryClassificationEvaluator Evaluates a binary classification scored dataset. Microsoft.ML.Runtime.Data.Evaluate Binary Microsoft.ML.Runtime.Data.BinaryClassifierMamlEvaluator+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+ClassificationEvaluateOutput Models.BinaryCrossValidator Cross validation for binary classification Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro CrossValidateBinary Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.CrossValidationBinaryMacro+Output] +Models.BinaryEnsemble Combine binary classifiers into an ensemble Microsoft.ML.Runtime.EntryPoints.EnsembleCreator CreateBinaryEnsemble Microsoft.ML.Runtime.EntryPoints.EnsembleCreator+ClassifierInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput +Models.BinaryPipelineEnsemble Combine binary classification models into an ensemble Microsoft.ML.Runtime.EntryPoints.EnsembleCreator CreateBinaryPipelineEnsemble Microsoft.ML.Runtime.EntryPoints.EnsembleCreator+PipelineClassifierInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput Models.ClassificationEvaluator Evaluates a multi class classification scored dataset. 
Microsoft.ML.Runtime.Data.Evaluate MultiClass Microsoft.ML.Runtime.Data.MultiClassMamlEvaluator+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+ClassificationEvaluateOutput Models.ClusterEvaluator Evaluates a clustering scored dataset. Microsoft.ML.Runtime.Data.Evaluate Clustering Microsoft.ML.Runtime.Data.ClusteringMamlEvaluator+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+CommonEvaluateOutput Models.CrossValidationResultsCombiner Combine the metric data views returned from cross validation. Microsoft.ML.Runtime.EntryPoints.CrossValidationMacro CombineMetrics Microsoft.ML.Runtime.EntryPoints.CrossValidationMacro+CombineMetricsInput Microsoft.ML.Runtime.EntryPoints.CrossValidationMacro+CombinedOutput Models.CrossValidator Cross validation for general learning Microsoft.ML.Runtime.EntryPoints.CrossValidationMacro CrossValidate Microsoft.ML.Runtime.EntryPoints.CrossValidationMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.CrossValidationMacro+Output] Models.CrossValidatorDatasetSplitter Split the dataset into the specified number of cross-validation folds (train and test sets) Microsoft.ML.Runtime.EntryPoints.CVSplit Split Microsoft.ML.Runtime.EntryPoints.CVSplit+Input Microsoft.ML.Runtime.EntryPoints.CVSplit+Output Models.DatasetTransformer Applies a TransformModel to a dataset. Microsoft.ML.Runtime.EntryPoints.ModelOperations Apply Microsoft.ML.Runtime.EntryPoints.ModelOperations+ApplyTransformModelInput Microsoft.ML.Runtime.EntryPoints.ModelOperations+ApplyTransformModelOutput +Models.EnsembleSummary Summarize a pipeline ensemble predictor. 
Microsoft.ML.Runtime.Ensemble.EntryPoints.PipelineEnsemble Summarize Microsoft.ML.Runtime.EntryPoints.SummarizePredictor+Input Microsoft.ML.Runtime.Ensemble.EntryPoints.PipelineEnsemble+SummaryOutput Models.FixedPlattCalibrator Apply a Platt calibrator with a fixed slope and offset to an input model Microsoft.ML.Runtime.Internal.Calibration.Calibrate FixedPlatt Microsoft.ML.Runtime.Internal.Calibration.Calibrate+FixedPlattInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+CalibratorOutput +Models.MultiClassPipelineEnsemble Combine multiclass classifiers into an ensemble Microsoft.ML.Runtime.EntryPoints.EnsembleCreator CreateMultiClassPipelineEnsemble Microsoft.ML.Runtime.EntryPoints.EnsembleCreator+PipelineClassifierInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput Models.MultiOutputRegressionEvaluator Evaluates a multi output regression scored dataset. Microsoft.ML.Runtime.Data.Evaluate MultiOutputRegression Microsoft.ML.Runtime.Data.MultiOutputRegressionMamlEvaluator+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+CommonEvaluateOutput Models.NaiveCalibrator Apply a Naive calibrator to an input model Microsoft.ML.Runtime.Internal.Calibration.Calibrate Naive Microsoft.ML.Runtime.Internal.Calibration.Calibrate+NoArgumentsInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+CalibratorOutput Models.OneVersusAll One-vs-All macro (OVA) Microsoft.ML.Runtime.EntryPoints.OneVersusAllMacro OVA Microsoft.ML.Runtime.EntryPoints.OneVersusAllMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.OneVersusAllMacro+Output] @@ -24,7 +29,9 @@ Models.PipelineSweeper AutoML pipeline sweeping optimzation macro. Microsoft.ML. 
Models.PlattCalibrator Apply a Platt calibrator to an input model Microsoft.ML.Runtime.Internal.Calibration.Calibrate Platt Microsoft.ML.Runtime.Internal.Calibration.Calibrate+NoArgumentsInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+CalibratorOutput Models.QuantileRegressionEvaluator Evaluates a quantile regression scored dataset. Microsoft.ML.Runtime.Data.Evaluate QuantileRegression Microsoft.ML.Runtime.Data.QuantileRegressionMamlEvaluator+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+CommonEvaluateOutput Models.RankerEvaluator Evaluates a ranking scored dataset. Microsoft.ML.Runtime.Data.Evaluate Ranking Microsoft.ML.Runtime.Data.RankerMamlEvaluator+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+CommonEvaluateOutput +Models.RegressionEnsemble Combine regression models into an ensemble Microsoft.ML.Runtime.EntryPoints.EnsembleCreator CreateRegressionEnsemble Microsoft.ML.Runtime.EntryPoints.EnsembleCreator+RegressionInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Models.RegressionEvaluator Evaluates a regression scored dataset. Microsoft.ML.Runtime.Data.Evaluate Regression Microsoft.ML.Runtime.Data.RegressionMamlEvaluator+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+CommonEvaluateOutput +Models.RegressionPipelineEnsemble Combine regression models into an ensemble Microsoft.ML.Runtime.EntryPoints.EnsembleCreator CreateRegressionPipelineEnsemble Microsoft.ML.Runtime.EntryPoints.EnsembleCreator+PipelineRegressionInput Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Models.Summarizer Summarize a linear regression predictor. Microsoft.ML.Runtime.EntryPoints.SummarizePredictor Summarize Microsoft.ML.Runtime.EntryPoints.SummarizePredictor+Input Microsoft.ML.Runtime.EntryPoints.CommonOutputs+SummaryOutput Models.SweepResultExtractor Extracts the sweep result. 
Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro ExtractSweepResult Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro+ResultInput Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro+Output Models.TrainTestBinaryEvaluator Train test for binary classification Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro TrainTestBinary Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Output] diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index ea1a86a2e8..21f2427b72 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -654,6 +654,57 @@ "IEvaluatorOutput" ] }, + { + "Name": "Models.AnomalyPipelineEnsemble", + "Desc": "Combine anomaly detection models into an ensemble", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Average" + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IAnomalyDetectionOutput", + "ITrainerOutput" + ] + }, { "Name": "Models.BinaryClassificationEvaluator", "Desc": "Evaluates a binary classification scored dataset.", @@ -967,6 +1018,122 @@ } ] }, + { + "Name": "Models.BinaryEnsemble", + "Desc": "Combine binary classifiers into an ensemble", + 
"FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average", + "Vote" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Median" + }, + { + "Name": "ValidatePipelines", + "Type": "Bool", + "Desc": "Whether to validate that all the pipelines are identical", + "Aliases": [ + "validate" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": true + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IBinaryClassificationOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Models.BinaryPipelineEnsemble", + "Desc": "Combine binary classification models into an ensemble", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average", + "Vote" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Median" + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IBinaryClassificationOutput", + "ITrainerOutput" + ] + }, { "Name": 
"Models.ClassificationEvaluator", "Desc": "Evaluates a multi class classification scored dataset.", @@ -1694,6 +1861,43 @@ "ITransformInput" ] }, + { + "Name": "Models.EnsembleSummary", + "Desc": "Summarize a pipeline ensemble predictor.", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The predictor to summarize", + "Aliases": [ + "predictorModel" + ], + "Required": true, + "SortOrder": 150.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "Summaries", + "Type": { + "Kind": "Array", + "ItemType": "DataView" + }, + "Desc": "The summaries of the individual predictors" + }, + { + "Name": "Stats", + "Type": { + "Kind": "Array", + "ItemType": "DataView" + }, + "Desc": "The model statistics of the individual predictors" + } + ] + }, { "Name": "Models.FixedPlattCalibrator", "Desc": "Apply a Platt calibrator with a fixed slope and offset to an input model", @@ -1776,6 +1980,58 @@ "ITrainerOutput" ] }, + { + "Name": "Models.MultiClassPipelineEnsemble", + "Desc": "Combine multiclass classifiers into an ensemble", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average", + "Vote" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Median" + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IMulticlassClassificationOutput", + "ITrainerOutput" + ] + }, { "Name": "Models.MultiOutputRegressionEvaluator", "Desc": "Evaluates a multi output 
regression scored dataset.", @@ -2822,6 +3078,69 @@ "IEvaluatorOutput" ] }, + { + "Name": "Models.RegressionEnsemble", + "Desc": "Combine regression models into an ensemble", + "FriendlyName": "Regression Ensemble (bagging, stacking, etc)", + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Median" + }, + { + "Name": "ValidatePipelines", + "Type": "Bool", + "Desc": "Whether to validate that all the pipelines are identical", + "Aliases": [ + "validate" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": true + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IRegressionOutput", + "ITrainerOutput" + ] + }, { "Name": "Models.RegressionEvaluator", "Desc": "Evaluates a regression scored dataset.", @@ -2944,6 +3263,57 @@ "IEvaluatorOutput" ] }, + { + "Name": "Models.RegressionPipelineEnsemble", + "Desc": "Combine regression models into an ensemble", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + 
"SortOrder": 2.0, + "IsNullable": false, + "Default": "Median" + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IRegressionOutput", + "ITrainerOutput" + ] + }, { "Name": "Models.Summarizer", "Desc": "Summarize a linear regression predictor.", diff --git a/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj b/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj index 9f38858721..eab101ddbc 100644 --- a/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj +++ b/test/Microsoft.ML.Core.Tests/Microsoft.ML.Core.Tests.csproj @@ -8,6 +8,7 @@ + diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index 60e79a943d..a1d51b574f 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -10,12 +10,15 @@ using Microsoft.ML.Runtime.Core.Tests.UnitTests; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Data.IO; +using Microsoft.ML.Runtime.Ensemble.OutputCombiners; using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.EntryPoints.JsonUtils; using Microsoft.ML.Runtime.FastTree; +using Microsoft.ML.Runtime.Internal.Calibration; using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Learners; using Microsoft.ML.Runtime.PCA; +using Microsoft.ML.Runtime.TextAnalytics; using Newtonsoft.Json; using Newtonsoft.Json.Linq; using Xunit; @@ -29,69 +32,41 @@ public TestEntryPoints(ITestOutputHelper output) : base(output) { } - [Fact] - public void EntryPointTrainTestSplit() + private IDataView GetBreastCancerDataView() { var dataPath = GetDataPath("breast-cancer.txt"); var inputFile = new SimpleFileHandle(Env, dataPath, false, false); - /*var dataView = ImportTextData.ImportText(Env, new ImportTextData.Input - { InputFile = inputFile, CustomSchema = "col=Label:0 col=Features:TX:1-9" 
}).Data;*/ - - var dataView = ImportTextData.TextLoader(Env, new ImportTextData.LoaderInput() + return ImportTextData.TextLoader(Env, new ImportTextData.LoaderInput() { Arguments = { - SeparatorChars = new []{',' }, - HasHeader = true, Column = new[] { new TextLoader.Column() { Name = "Label", Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} }, - Type = Runtime.Data.DataKind.Text + Type = Runtime.Data.DataKind.R4 }, new TextLoader.Column() { Name = "Features", Source = new [] { new TextLoader.Range() { Min = 1, Max = 9} }, - Type = Runtime.Data.DataKind.Text + Type = Runtime.Data.DataKind.R4 } } }, InputFile = inputFile }).Data; - - var splitOutput = TrainTestSplit.Split(Env, new TrainTestSplit.Input { Data = dataView, Fraction = 0.9f }); - - int totalRows = CountRows(dataView); - int trainRows = CountRows(splitOutput.TrainData); - int testRows = CountRows(splitOutput.TestData); - - Assert.Equal(totalRows, trainRows + testRows); - Assert.Equal(0.9, (double)trainRows / totalRows, 1); - } - - private static int CountRows(IDataView dataView) - { - int totalRows = 0; - using (var cursor = dataView.GetRowCursor(col => false)) - { - while (cursor.MoveNext()) - totalRows++; - } - - return totalRows; } - [Fact()] - public void EntryPointFeatureCombiner() + private IDataView GetBreastCancerDataviewWithTextColumns() { var dataPath = GetDataPath("breast-cancer.txt"); var inputFile = new SimpleFileHandle(Env, dataPath, false, false); - var dataView = ImportTextData.TextLoader(Env, new ImportTextData.LoaderInput() + return ImportTextData.TextLoader(Env, new ImportTextData.LoaderInput() { Arguments = { @@ -128,6 +103,39 @@ public void EntryPointFeatureCombiner() InputFile = inputFile }).Data; + } + + + [Fact] + public void EntryPointTrainTestSplit() + { + var dataView = GetBreastCancerDataView(); + var splitOutput = TrainTestSplit.Split(Env, new TrainTestSplit.Input { Data = dataView, Fraction = 0.9f }); + + int totalRows = CountRows(dataView); + int trainRows = 
CountRows(splitOutput.TrainData); + int testRows = CountRows(splitOutput.TestData); + + Assert.Equal(totalRows, trainRows + testRows); + Assert.Equal(0.9, (double)trainRows / totalRows, 1); + } + + private static int CountRows(IDataView dataView) + { + int totalRows = 0; + using (var cursor = dataView.GetRowCursor(col => false)) + { + while (cursor.MoveNext()) + totalRows++; + } + + return totalRows; + } + + [Fact()] + public void EntryPointFeatureCombiner() + { + var dataView = GetBreastCancerDataviewWithTextColumns(); dataView = Env.CreateTransform("Term{col=F1}", dataView); var result = FeatureCombiner.PrepareFeatures(Env, new FeatureCombiner.FeatureCombinerInput() { Data = dataView, Features = new[] { "F1", "F2", "Rest" } }).OutputData; @@ -146,48 +154,8 @@ public void EntryPointFeatureCombiner() [Fact] public void EntryPointScoring() { - var dataPath = GetDataPath("breast-cancer.txt"); - var inputFile = new SimpleFileHandle(Env, dataPath, false, false); - var dataView = ImportTextData.TextLoader(Env, new ImportTextData.LoaderInput() - { - Arguments = - { - HasHeader = true, - Column = new[] - { - new TextLoader.Column() - { - Name = "Label", - Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} } - }, - - new TextLoader.Column() - { - Name = "F1", - Source = new [] { new TextLoader.Range() { Min = 1, Max = 1} }, - Type = Runtime.Data.DataKind.Text - }, - - new TextLoader.Column() - { - Name = "F2", - Source = new [] { new TextLoader.Range() { Min = 2, Max = 2} }, - Type = Runtime.Data.DataKind.I4 - }, - - new TextLoader.Column() - { - Name = "Rest", - Source = new [] { new TextLoader.Range() { Min = 3, Max = 9} } - } - } - }, - - InputFile = inputFile - }).Data; - + var dataView = GetBreastCancerDataView(); dataView = Env.CreateTransform("Term{col=F1}", dataView); - var trainData = FeatureCombiner.PrepareFeatures(Env, new FeatureCombiner.FeatureCombinerInput() { Data = dataView, Features = new[] { "F1", "F2", "Rest" } }); var lrModel = 
LogisticRegression.TrainBinary(Env, new LogisticRegression.Arguments { TrainingData = trainData.OutputData }).PredictorModel; var model = ModelOperations.CombineTwoModels(Env, new ModelOperations.SimplePredictorModelInput() { TransformModel = trainData.Model, PredictorModel = lrModel }).PredictorModel; @@ -206,45 +174,7 @@ public void EntryPointScoring() [Fact] public void EntryPointApplyModel() { - var dataPath = GetDataPath("breast-cancer.txt"); - var inputFile = new SimpleFileHandle(Env, dataPath, false, false); - var dataView = ImportTextData.TextLoader(Env, new ImportTextData.LoaderInput() - { - Arguments = - { - HasHeader = true, - Column = new[] - { - new TextLoader.Column() - { - Name = "Label", - Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} }, - }, - - new TextLoader.Column() - { - Name = "F1", - Source = new [] { new TextLoader.Range() { Min = 1, Max = 1} }, - Type = Runtime.Data.DataKind.Text - }, - - new TextLoader.Column() - { - Name = "F2", - Source = new [] { new TextLoader.Range() { Min = 2, Max = 2} }, - Type = Runtime.Data.DataKind.I4 - }, - - new TextLoader.Column() - { - Name = "Rest", - Source = new [] { new TextLoader.Range() { Min = 3, Max = 9} } - } - } - }, - - InputFile = inputFile - }).Data; + var dataView = GetBreastCancerDataView(); dataView = Env.CreateTransform("Term{col=F1}", dataView); @@ -258,50 +188,7 @@ public void EntryPointApplyModel() [Fact] public void EntryPointCaching() { - var dataPath = GetDataPath("breast-cancer.txt"); - var inputFile = new SimpleFileHandle(Env, dataPath, false, false); - /*var dataView = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFile, - CustomSchema = "col=Label:0 col=F1:TX:1 col=F2:I4:2 col=Rest:3-9" }).Data; - */ - - var dataView = ImportTextData.TextLoader(Env, new ImportTextData.LoaderInput() - { - Arguments = - { - SeparatorChars = new []{',' }, - HasHeader = true, - Column = new[] - { - new TextLoader.Column() - { - Name = "Label", - Source = new [] 
{ new TextLoader.Range() { Min = 0, Max = 0} } - }, - - new TextLoader.Column() - { - Name = "F1", - Source = new [] { new TextLoader.Range() { Min = 1, Max = 1} }, - Type = Runtime.Data.DataKind.Text - }, - - new TextLoader.Column() - { - Name = "F2", - Source = new [] { new TextLoader.Range() { Min = 2, Max = 2} }, - Type = Runtime.Data.DataKind.I4 - }, - - new TextLoader.Column() - { - Name = "Rest", - Source = new [] { new TextLoader.Range() { Min = 3, Max = 9} } - } - } - }, - - InputFile = inputFile - }).Data; + var dataView = GetBreastCancerDataviewWithTextColumns(); dataView = Env.CreateTransform("Term{col=F1}", dataView); @@ -480,6 +367,128 @@ public void EntryPointInputArgsChecks() EntryPointUtils.CheckInputArgs(Env, input); } + [Fact] + public void EntryPointCreateEnsemble() + { + var dataView = GetBreastCancerDataView(); + const int nModels = 5; + var splitOutput = CVSplit.Split(Env, new CVSplit.Input { Data = dataView, NumFolds = nModels + 1 }); + var predictorModels = new IPredictorModel[nModels]; + var individualScores = new IDataView[nModels]; + for (int i = 0; i < nModels; i++) + { + var data = splitOutput.TrainData[i]; + var lrInput = new LogisticRegression.Arguments + { + TrainingData = data, + L1Weight = (Single)0.1 * i, + L2Weight = (Single)0.01 * (1 + i), + NormalizeFeatures = NormalizeOption.No + }; + predictorModels[i] = LogisticRegression.TrainBinary(Env, lrInput).PredictorModel; + individualScores[i] = + ScoreModel.Score(Env, + new ScoreModel.Input { Data = splitOutput.TestData[nModels], PredictorModel = predictorModels[i] }) + .ScoredData; + + individualScores[i] = new CopyColumnsTransform(Env, + new CopyColumnsTransform.Arguments() + { + Column = new[] + { + new CopyColumnsTransform.Column() + { + Name = MetadataUtils.Const.ScoreValueKind.Score + i, + Source = MetadataUtils.Const.ScoreValueKind.Score + }, + } + }, individualScores[i]); + individualScores[i] = new DropColumnsTransform(Env, + new DropColumnsTransform.Arguments() { Column = 
new[] { MetadataUtils.Const.ScoreValueKind.Score } }, + individualScores[i]); + } + + var avgEnsembleInput = new EnsembleCreator.ClassifierInput { Models = predictorModels, ModelCombiner = EnsembleCreator.ClassifierCombiner.Average }; + var avgEnsemble = EnsembleCreator.CreateBinaryEnsemble(Env, avgEnsembleInput).PredictorModel; + var avgScored = + ScoreModel.Score(Env, + new ScoreModel.Input { Data = splitOutput.TestData[nModels], PredictorModel = avgEnsemble }).ScoredData; + + var medEnsembleInput = new EnsembleCreator.ClassifierInput { Models = predictorModels }; + var medEnsemble = EnsembleCreator.CreateBinaryEnsemble(Env, medEnsembleInput).PredictorModel; + var medScored = + ScoreModel.Score(Env, + new ScoreModel.Input { Data = splitOutput.TestData[nModels], PredictorModel = medEnsemble }).ScoredData; + + var regEnsembleInput = new EnsembleCreator.RegressionInput() { Models = predictorModels }; + var regEnsemble = EnsembleCreator.CreateRegressionEnsemble(Env, regEnsembleInput).PredictorModel; + var regScored = + ScoreModel.Score(Env, + new ScoreModel.Input { Data = splitOutput.TestData[nModels], PredictorModel = medEnsemble }).ScoredData; + + var zippedScores = ZipDataView.Create(Env, individualScores); + + var avgComb = new Average(Env).GetCombiner(); + var medComb = new Median(Env).GetCombiner(); + using (var curs1 = avgScored.GetRowCursor(col => true)) + using (var curs2 = medScored.GetRowCursor(col => true)) + using (var curs3 = regScored.GetRowCursor(col => true)) + using (var curs4 = zippedScores.GetRowCursor(col => true)) + { + var found = curs1.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out int scoreCol); + Assert.True(found); + var avgScoreGetter = curs1.GetGetter(scoreCol); + + found = curs2.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out scoreCol); + Assert.True(found); + var medScoreGetter = curs2.GetGetter(scoreCol); + + found = curs3.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, 
out scoreCol); + Assert.True(found); + var regScoreGetter = curs3.GetGetter(scoreCol); + + var individualScoreGetters = new ValueGetter[nModels]; + for (int i = 0; i < nModels; i++) + { + curs4.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score + i, out scoreCol); + individualScoreGetters[i] = curs4.GetGetter(scoreCol); + } + + var scoreBuffer = new Single[nModels]; + while (curs1.MoveNext()) + { + var move = curs2.MoveNext(); + Assert.True(move); + move = curs3.MoveNext(); + Assert.True(move); + move = curs4.MoveNext(); + Assert.True(move); + + Single score = 0; + avgScoreGetter(ref score); + for (int i = 0; i < nModels; i++) + individualScoreGetters[i](ref scoreBuffer[i]); + Single avgScore = 0; + avgComb(ref avgScore, scoreBuffer, null); + Assert.Equal(score, avgScore); + + medScoreGetter(ref score); + Single medScore = 0; + medComb(ref medScore, scoreBuffer, null); + Assert.Equal(score, medScore); + + regScoreGetter(ref score); + Assert.Equal(score, medScore); + } + var moved = curs2.MoveNext(); + Assert.False(moved); + moved = curs3.MoveNext(); + Assert.False(moved); + moved = curs4.MoveNext(); + Assert.False(moved); + } + } + [Fact] public void EntryPointOptionalParams() { @@ -629,46 +638,681 @@ public void EntryPointExecGraphCommand() // Assert.NotNull(model); //} - //[Fact] - //public void EntryPointCalibrate() - //{ - // var dataPath = GetDataPath("breast-cancer.txt"); - // var inputFile = new SimpleFileHandle(Env, dataPath, false, false); - // var dataView = ImportTextData.ImportText(Env, new ImportTextData.Input { InputFile = inputFile, CustomSchema = "col=Label:0 col=Features:1-9" }).Data; - - // var splitOutput = CVSplit.Split(Env, new CVSplit.Input { Data = dataView, NumFolds = 3 }); - - // var lrModel = LogisticRegression.TrainBinary(Env, new LogisticRegression.Arguments { TrainingData = splitOutput.TestData[0] }).PredictorModel; - // var calibratedLrModel = Calibrate.FixedPlatt(Env, - // new Calibrate.FixedPlattInput { Data = 
splitOutput.TestData[1], UncalibratedPredictorModel = lrModel }).PredictorModel; - - // var scored1 = ScoreModel.Score(Env, new ScoreModel.Input() { Data = splitOutput.TestData[2], PredictorModel = lrModel }).ScoredData; - // scored1 = ScoreModel.SelectColumns(Env, new ScoreModel.ScoreColumnSelectorInput() { Data = scored1, ExtraColumns = new[] { "Label" } }).OutputData; - - // var scored2 = ScoreModel.Score(Env, new ScoreModel.Input() { Data = splitOutput.TestData[2], PredictorModel = calibratedLrModel }).ScoredData; - // scored2 = ScoreModel.SelectColumns(Env, new ScoreModel.ScoreColumnSelectorInput() { Data = scored2, ExtraColumns = new[] { "Label" } }).OutputData; - - // Assert.Equal(4, scored1.Schema.ColumnCount); - // CheckSameValues(scored1, scored2); - - // var input = new Calibrate.NoArgumentsInput() { Data = splitOutput.TestData[1], UncalibratedPredictorModel = lrModel }; - // calibratedLrModel = Calibrate.Platt(Env, input).PredictorModel; - // calibratedLrModel = Calibrate.Naive(Env, input).PredictorModel; - // calibratedLrModel = Calibrate.Pav(Env, input).PredictorModel; - - // // This tests that the SchemaBindableCalibratedPredictor doesn't get confused if its sub-predictor is already calibrated. 
- // var fastForest = new FastForestClassification(Env, new FastForestClassification.Arguments()); - // var rmd = RoleMappedData.Create(splitOutput.TrainData[0], - // RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Feature, "Features"), - // RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Label, "Label")); - // fastForest.Train(rmd); - // var ffModel = new PredictorModel(Env, rmd, splitOutput.TrainData[0], fastForest.CreatePredictor()); - // var calibratedFfModel = Calibrate.Platt(Env, - // new Calibrate.NoArgumentsInput() { Data = splitOutput.TestData[0], UncalibratedPredictorModel = ffModel }).PredictorModel; - // var twiceCalibratedFfModel = Calibrate.Platt(Env, - // new Calibrate.NoArgumentsInput() { Data = splitOutput.TestData[0], UncalibratedPredictorModel = calibratedFfModel }).PredictorModel; - // var scoredFf = ScoreModel.Score(Env, new ScoreModel.Input() { Data = splitOutput.TestData[2], PredictorModel = twiceCalibratedFfModel }).ScoredData; - //} + [Fact] + public void EntryPointCalibrate() + { + var dataView = GetBreastCancerDataView(); + + var splitOutput = CVSplit.Split(Env, new CVSplit.Input { Data = dataView, NumFolds = 3 }); + + var lrModel = LogisticRegression.TrainBinary(Env, new LogisticRegression.Arguments { TrainingData = splitOutput.TestData[0] }).PredictorModel; + var calibratedLrModel = Calibrate.FixedPlatt(Env, + new Calibrate.FixedPlattInput { Data = splitOutput.TestData[1], UncalibratedPredictorModel = lrModel }).PredictorModel; + + var scored1 = ScoreModel.Score(Env, new ScoreModel.Input() { Data = splitOutput.TestData[2], PredictorModel = lrModel }).ScoredData; + scored1 = ScoreModel.SelectColumns(Env, new ScoreModel.ScoreColumnSelectorInput() { Data = scored1, ExtraColumns = new[] { "Label" } }).OutputData; + + var scored2 = ScoreModel.Score(Env, new ScoreModel.Input() { Data = splitOutput.TestData[2], PredictorModel = calibratedLrModel }).ScoredData; + scored2 = ScoreModel.SelectColumns(Env, new 
ScoreModel.ScoreColumnSelectorInput() { Data = scored2, ExtraColumns = new[] { "Label" } }).OutputData; + + Assert.Equal(4, scored1.Schema.ColumnCount); + CheckSameValues(scored1, scored2); + + var input = new Calibrate.NoArgumentsInput() { Data = splitOutput.TestData[1], UncalibratedPredictorModel = lrModel }; + calibratedLrModel = Calibrate.Platt(Env, input).PredictorModel; + calibratedLrModel = Calibrate.Naive(Env, input).PredictorModel; + calibratedLrModel = Calibrate.Pav(Env, input).PredictorModel; + + // This tests that the SchemaBindableCalibratedPredictor doesn't get confused if its sub-predictor is already calibrated. + var fastForest = new FastForestClassification(Env, new FastForestClassification.Arguments()); + var rmd = RoleMappedData.Create(splitOutput.TrainData[0], + RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Feature, "Features"), + RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Label, "Label")); + fastForest.Train(rmd); + var ffModel = new PredictorModel(Env, rmd, splitOutput.TrainData[0], fastForest.CreatePredictor()); + var calibratedFfModel = Calibrate.Platt(Env, + new Calibrate.NoArgumentsInput() { Data = splitOutput.TestData[0], UncalibratedPredictorModel = ffModel }).PredictorModel; + var twiceCalibratedFfModel = Calibrate.Platt(Env, + new Calibrate.NoArgumentsInput() { Data = splitOutput.TestData[0], UncalibratedPredictorModel = calibratedFfModel }).PredictorModel; + var scoredFf = ScoreModel.Score(Env, new ScoreModel.Input() { Data = splitOutput.TestData[2], PredictorModel = twiceCalibratedFfModel }).ScoredData; + } + + + [Fact] + public void EntryPointPipelineEnsemble() + { + var dataView = GetBreastCancerDataView(); + const int nModels = 5; + var splitOutput = CVSplit.Split(Env, new CVSplit.Input { Data = dataView, NumFolds = nModels + 1 }); + var predictorModels = new IPredictorModel[nModels]; + var individualScores = new IDataView[nModels]; + for (int i = 0; i < nModels; i++) + { + var data = 
splitOutput.TrainData[i]; + data = new RffTransform(Env, new RffTransform.Arguments() + { + Column = new[] + { + new RffTransform.Column() {Name = "Features1", Source = "Features"}, + new RffTransform.Column() {Name = "Features2", Source = "Features"}, + }, + NewDim = 10, + UseSin = false + }, data); + data = new ConcatTransform(Env, new ConcatTransform.Arguments() + { + Column = new[] { new ConcatTransform.Column() { Name = "Features", Source = new[] { "Features1", "Features2" } } } + }, data); + + data = new TermTransform(Env, new TermTransform.Arguments() + { + Column = new[] + { + new TermTransform.Column() + { + Name = "Label", + Source = "Label", + Sort = TermTransform.SortOrder.Value + } + } + }, data); + + var lrInput = new LogisticRegression.Arguments + { + TrainingData = data, + L1Weight = (Single)0.1 * i, + L2Weight = (Single)0.01 * (1 + i), + NormalizeFeatures = NormalizeOption.Yes + }; + predictorModels[i] = LogisticRegression.TrainBinary(Env, lrInput).PredictorModel; + var transformModel = new TransformModel(Env, data, splitOutput.TrainData[i]); + + predictorModels[i] = ModelOperations.CombineTwoModels(Env, + new ModelOperations.SimplePredictorModelInput() + { PredictorModel = predictorModels[i], TransformModel = transformModel }).PredictorModel; + + individualScores[i] = + ScoreModel.Score(Env, + new ScoreModel.Input { Data = splitOutput.TestData[nModels], PredictorModel = predictorModels[i] }) + .ScoredData; + } + + var binaryEnsembleModel = EnsembleCreator.CreateBinaryPipelineEnsemble(Env, + new EnsembleCreator.PipelineClassifierInput() + { + ModelCombiner = EntryPoints.EnsembleCreator.ClassifierCombiner.Average, + Models = predictorModels + }).PredictorModel; + var binaryEnsembleCalibrated = Calibrate.Platt(Env, + new Calibrate.NoArgumentsInput() + { + Data = splitOutput.TestData[nModels], + UncalibratedPredictorModel = binaryEnsembleModel + }).PredictorModel; + var binaryScored = ScoreModel.Score(Env, + new ScoreModel.Input + { + Data = 
splitOutput.TestData[nModels], + PredictorModel = binaryEnsembleModel + }).ScoredData; + var binaryScoredCalibrated = ScoreModel.Score(Env, + new ScoreModel.Input + { + Data = splitOutput.TestData[nModels], + PredictorModel = binaryEnsembleCalibrated + }).ScoredData; + + var regressionEnsembleModel = EntryPoints.EnsembleCreator.CreateRegressionPipelineEnsemble(Env, + new EntryPoints.EnsembleCreator.PipelineRegressionInput() + { + ModelCombiner = EntryPoints.EnsembleCreator.ScoreCombiner.Average, + Models = predictorModels + }).PredictorModel; + var regressionScored = ScoreModel.Score(Env, + new ScoreModel.Input + { + Data = splitOutput.TestData[nModels], + PredictorModel = regressionEnsembleModel + }).ScoredData; + + var anomalyEnsembleModel = EntryPoints.EnsembleCreator.CreateAnomalyPipelineEnsemble(Env, + new EntryPoints.EnsembleCreator.PipelineAnomalyInput() + { + ModelCombiner = EnsembleCreator.ScoreCombiner.Average, + Models = predictorModels + }).PredictorModel; + var anomalyScored = ScoreModel.Score(Env, + new ScoreModel.Input + { + Data = splitOutput.TestData[nModels], + PredictorModel = anomalyEnsembleModel + }).ScoredData; + + // Make sure the scorers have the correct types. 
+ var hasScoreCol = binaryScored.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out int scoreIndex); + Assert.True(hasScoreCol, "Data scored with binary ensemble does not have a score column"); + var type = binaryScored.Schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.ScoreColumnKind, scoreIndex); + Assert.True(type != null && type.IsText, "Binary ensemble scored data does not have correct type of metadata."); + var kind = default(DvText); + binaryScored.Schema.GetMetadata(MetadataUtils.Kinds.ScoreColumnKind, scoreIndex, ref kind); + Assert.True(kind.EqualsStr(MetadataUtils.Const.ScoreColumnKind.BinaryClassification), + $"Binary ensemble scored data column type should be '{MetadataUtils.Const.ScoreColumnKind.BinaryClassification}', but is instead '{kind}'"); + + hasScoreCol = regressionScored.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out scoreIndex); + Assert.True(hasScoreCol, "Data scored with regression ensemble does not have a score column"); + type = regressionScored.Schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.ScoreColumnKind, scoreIndex); + Assert.True(type != null && type.IsText, "Regression ensemble scored data does not have correct type of metadata."); + regressionScored.Schema.GetMetadata(MetadataUtils.Kinds.ScoreColumnKind, scoreIndex, ref kind); + Assert.True(kind.EqualsStr(MetadataUtils.Const.ScoreColumnKind.Regression), + $"Regression ensemble scored data column type should be '{MetadataUtils.Const.ScoreColumnKind.Regression}', but is instead '{kind}'"); + + hasScoreCol = anomalyScored.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out scoreIndex); + Assert.True(hasScoreCol, "Data scored with anomaly detection ensemble does not have a score column"); + type = anomalyScored.Schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.ScoreColumnKind, scoreIndex); + Assert.True(type != null && type.IsText, "Anomaly detection ensemble scored data does not have correct type of metadata."); + 
anomalyScored.Schema.GetMetadata(MetadataUtils.Kinds.ScoreColumnKind, scoreIndex, ref kind); + Assert.True(kind.EqualsStr(MetadataUtils.Const.ScoreColumnKind.AnomalyDetection), + $"Anomaly detection ensemble scored data column type should be '{MetadataUtils.Const.ScoreColumnKind.AnomalyDetection}', but is instead '{kind}'"); + + var modelPath = DeleteOutputPath("SavePipe", "PipelineEnsembleModel.zip"); + using (var file = Env.CreateOutputFile(modelPath)) + using (var strm = file.CreateWriteStream()) + regressionEnsembleModel.Save(Env, strm); + + IPredictorModel loadedFromSaved; + using (var file = Env.OpenInputFile(modelPath)) + using (var strm = file.OpenReadStream()) + loadedFromSaved = new PredictorModel(Env, strm); + + var scoredFromSaved = ScoreModel.Score(Env, + new ScoreModel.Input() + { + Data = splitOutput.TestData[nModels], + PredictorModel = loadedFromSaved + }).ScoredData; + + using (var cursReg = regressionScored.GetRowCursor(col => true)) + using (var cursBin = binaryScored.GetRowCursor(col => true)) + using (var cursBinCali = binaryScoredCalibrated.GetRowCursor(col => true)) + using (var cursAnom = anomalyScored.GetRowCursor(col => true)) + using (var curs0 = individualScores[0].GetRowCursor(col => true)) + using (var curs1 = individualScores[1].GetRowCursor(col => true)) + using (var curs2 = individualScores[2].GetRowCursor(col => true)) + using (var curs3 = individualScores[3].GetRowCursor(col => true)) + using (var curs4 = individualScores[4].GetRowCursor(col => true)) + using (var cursSaved = scoredFromSaved.GetRowCursor(col => true)) + { + var good = curs0.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out int col); + Assert.True(good); + var getter0 = curs0.GetGetter(col); + good = curs1.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getter1 = curs1.GetGetter(col); + good = curs2.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + 
Assert.True(good); + var getter2 = curs2.GetGetter(col); + good = curs3.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getter3 = curs3.GetGetter(col); + good = curs4.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getter4 = curs4.GetGetter(col); + good = cursReg.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getterReg = cursReg.GetGetter(col); + good = cursBin.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getterBin = cursBin.GetGetter(col); + good = cursBinCali.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getterBinCali = cursBinCali.GetGetter(col); + good = cursSaved.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getterSaved = cursSaved.GetGetter(col); + good = cursAnom.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getterAnom = cursAnom.GetGetter(col); + + var c = new Average(Env).GetCombiner(); + while (cursReg.MoveNext()) + { + Single score = 0; + getterReg(ref score); + Assert.True(curs0.MoveNext()); + Assert.True(curs1.MoveNext()); + Assert.True(curs2.MoveNext()); + Assert.True(curs3.MoveNext()); + Assert.True(curs4.MoveNext()); + Assert.True(cursBin.MoveNext()); + Assert.True(cursBinCali.MoveNext()); + Assert.True(cursSaved.MoveNext()); + Assert.True(cursAnom.MoveNext()); + Single[] score0 = new Single[5]; + getter0(ref score0[0]); + getter1(ref score0[1]); + getter2(ref score0[2]); + getter3(ref score0[3]); + getter4(ref score0[4]); + Single scoreBin = 0; + Single scoreBinCali = 0; + Single scoreSaved = 0; + Single scoreAnom = 0; + getterBin(ref scoreBin); + getterBinCali(ref scoreBinCali); + getterSaved(ref scoreSaved); + getterAnom(ref scoreAnom); + 
Assert.True(Single.IsNaN(scoreBin) && Single.IsNaN(score) || scoreBin == score); + Assert.True(Single.IsNaN(scoreBinCali) && Single.IsNaN(score) || scoreBinCali == score); + Assert.True(Single.IsNaN(scoreSaved) && Single.IsNaN(score) || scoreSaved == score); + Assert.True(Single.IsNaN(scoreAnom) && Single.IsNaN(score) || scoreAnom == score); + + Single avg = 0; + c(ref avg, score0, null); + Assert.True(Single.IsNaN(avg) && Single.IsNaN(score) || avg == score); + } + Assert.False(curs0.MoveNext()); + Assert.False(curs1.MoveNext()); + Assert.False(curs2.MoveNext()); + Assert.False(curs3.MoveNext()); + Assert.False(curs4.MoveNext()); + Assert.False(cursBin.MoveNext()); + Assert.False(cursBinCali.MoveNext()); + Assert.False(cursSaved.MoveNext()); + Assert.False(cursAnom.MoveNext()); + } + } + + + [Fact] + public void EntryPointPipelineEnsembleText() + { + var dataPath = GetDataPath("lm.sample.txt"); + var inputFile = new SimpleFileHandle(Env, dataPath, false, false); + var dataView = ImportTextData.TextLoader(Env, new ImportTextData.LoaderInput() + { + Arguments = + { + HasHeader = true, + Column = new[] + { + new TextLoader.Column() + { + Name = "Label", + Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} }, + Type = Runtime.Data.DataKind.TX + }, + + new TextLoader.Column() + { + Name = "Text", + Source = new [] { new TextLoader.Range() { Min = 3, Max = 3} }, + Type = Runtime.Data.DataKind.TX + } + } + }, + + InputFile = inputFile + }).Data; + + ValueMapper labelToBinary = + (ref DvText src, ref DvBool dst) => + { + if (src.EqualsStr("Sport")) + dst = DvBool.True; + else + dst = DvBool.False; + }; + dataView = LambdaColumnMapper.Create(Env, "TextToBinaryLabel", dataView, "Label", "Label", + TextType.Instance, BoolType.Instance, labelToBinary); + + const int nModels = 5; + var splitOutput = CVSplit.Split(Env, new CVSplit.Input { Data = dataView, NumFolds = nModels + 1 }); + var predictorModels = new IPredictorModel[nModels]; + var individualScores = new 
IDataView[nModels]; + for (int i = 0; i < nModels; i++) + { + var data = splitOutput.TrainData[i]; + if (i % 2 == 0) + { + data = TextTransform.Create(Env, + new TextTransform.Arguments() + { + Column = new TextTransform.Column() { Name = "Features", Source = new[] { "Text" } }, + StopWordsRemover = new PredefinedStopWordsRemoverFactory() + }, data); + } + else + { + data = WordHashBagTransform.Create(Env, + new WordHashBagTransform.Arguments() + { + Column = + new[] { new WordHashBagTransform.Column() { Name = "Features", Source = new[] { "Text" } }, } + }, + data); + } + var lrInput = new LogisticRegression.Arguments + { + TrainingData = data, + L1Weight = (Single)0.1 * i, + L2Weight = (Single)0.01 * (1 + i), + NormalizeFeatures = NormalizeOption.Yes + }; + predictorModels[i] = LogisticRegression.TrainBinary(Env, lrInput).PredictorModel; + var transformModel = new TransformModel(Env, data, splitOutput.TrainData[i]); + + predictorModels[i] = ModelOperations.CombineTwoModels(Env, + new ModelOperations.SimplePredictorModelInput() + { PredictorModel = predictorModels[i], TransformModel = transformModel }).PredictorModel; + + individualScores[i] = + ScoreModel.Score(Env, + new ScoreModel.Input { Data = splitOutput.TestData[nModels], PredictorModel = predictorModels[i] }) + .ScoredData; + } + + var binaryEnsembleModel = EnsembleCreator.CreateBinaryPipelineEnsemble(Env, + new EnsembleCreator.PipelineClassifierInput() + { + ModelCombiner = EnsembleCreator.ClassifierCombiner.Average, + Models = predictorModels + }).PredictorModel; + var binaryEnsembleCalibrated = Calibrate.Platt(Env, + new Calibrate.NoArgumentsInput() + { + Data = splitOutput.TestData[nModels], + UncalibratedPredictorModel = binaryEnsembleModel + }).PredictorModel; + var binaryScored = ScoreModel.Score(Env, + new ScoreModel.Input + { + Data = splitOutput.TestData[nModels], + PredictorModel = binaryEnsembleModel + }).ScoredData; + var binaryScoredCalibrated = ScoreModel.Score(Env, + new ScoreModel.Input + 
{ + Data = splitOutput.TestData[nModels], + PredictorModel = binaryEnsembleCalibrated + }).ScoredData; + + var regressionEnsembleModel = EnsembleCreator.CreateRegressionPipelineEnsemble(Env, + new EnsembleCreator.PipelineRegressionInput() + { + ModelCombiner = EnsembleCreator.ScoreCombiner.Average, + Models = predictorModels + }).PredictorModel; + var regressionScored = ScoreModel.Score(Env, + new ScoreModel.Input + { + Data = splitOutput.TestData[nModels], + PredictorModel = regressionEnsembleModel + }).ScoredData; + + var modelPath = DeleteOutputPath("SavePipe", "PipelineEnsembleModel.zip"); + using (var file = Env.CreateOutputFile(modelPath)) + using (var strm = file.CreateWriteStream()) + regressionEnsembleModel.Save(Env, strm); + + IPredictorModel loadedFromSaved; + using (var file = Env.OpenInputFile(modelPath)) + using (var strm = file.OpenReadStream()) + loadedFromSaved = new PredictorModel(Env, strm); + + var scoredFromSaved = ScoreModel.Score(Env, + new ScoreModel.Input() + { + Data = splitOutput.TestData[nModels], + PredictorModel = loadedFromSaved + }).ScoredData; + + using (var cursReg = regressionScored.GetRowCursor(col => true)) + using (var cursBin = binaryScored.GetRowCursor(col => true)) + using (var cursBinCali = binaryScoredCalibrated.GetRowCursor(col => true)) + using (var curs0 = individualScores[0].GetRowCursor(col => true)) + using (var curs1 = individualScores[1].GetRowCursor(col => true)) + using (var curs2 = individualScores[2].GetRowCursor(col => true)) + using (var curs3 = individualScores[3].GetRowCursor(col => true)) + using (var curs4 = individualScores[4].GetRowCursor(col => true)) + using (var cursSaved = scoredFromSaved.GetRowCursor(col => true)) + { + var good = curs0.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out int col); + Assert.True(good); + var getter0 = curs0.GetGetter(col); + good = curs1.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getter1 
= curs1.GetGetter(col); + good = curs2.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getter2 = curs2.GetGetter(col); + good = curs3.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getter3 = curs3.GetGetter(col); + good = curs4.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getter4 = curs4.GetGetter(col); + good = cursReg.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getterReg = cursReg.GetGetter(col); + good = cursBin.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getterBin = cursBin.GetGetter(col); + good = cursBinCali.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getterBinCali = cursBinCali.GetGetter(col); + good = cursSaved.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getterSaved = cursSaved.GetGetter(col); + + var c = new Average(Env).GetCombiner(); + while (cursReg.MoveNext()) + { + Single score = 0; + getterReg(ref score); + Assert.True(curs0.MoveNext()); + Assert.True(curs1.MoveNext()); + Assert.True(curs2.MoveNext()); + Assert.True(curs3.MoveNext()); + Assert.True(curs4.MoveNext()); + Assert.True(cursBin.MoveNext()); + Assert.True(cursBinCali.MoveNext()); + Assert.True(cursSaved.MoveNext()); + Single[] score0 = new Single[5]; + getter0(ref score0[0]); + getter1(ref score0[1]); + getter2(ref score0[2]); + getter3(ref score0[3]); + getter4(ref score0[4]); + Single scoreBin = 0; + Single scoreBinCali = 0; + Single scoreSaved = 0; + getterBin(ref scoreBin); + getterBinCali(ref scoreBinCali); + getterSaved(ref scoreSaved); + Assert.True(Single.IsNaN(scoreBin) && Single.IsNaN(score) || scoreBin == score); + Assert.True(Single.IsNaN(scoreBinCali) && Single.IsNaN(score) 
|| scoreBinCali == score); + Assert.True(Single.IsNaN(scoreSaved) && Single.IsNaN(score) || scoreSaved == score); + + Single avg = 0; + c(ref avg, score0, null); + Assert.True(Single.IsNaN(avg) && Single.IsNaN(score) || avg == score); + } + Assert.False(curs0.MoveNext()); + Assert.False(curs1.MoveNext()); + Assert.False(curs2.MoveNext()); + Assert.False(curs3.MoveNext()); + Assert.False(curs4.MoveNext()); + Assert.False(cursBin.MoveNext()); + Assert.False(cursBinCali.MoveNext()); + Assert.False(cursSaved.MoveNext()); + } + } + + [Fact] + public void EntryPointMulticlassPipelineEnsemble() + { + var dataPath = GetDataPath("iris.txt"); + var inputFile = new SimpleFileHandle(Env, dataPath, false, false); + var dataView = ImportTextData.TextLoader(Env, new ImportTextData.LoaderInput() + { + Arguments = + { + Column = new[] + { + new TextLoader.Column() + { + Name = "Label", + Source = new [] { new TextLoader.Range() { Min = 0, Max = 0} }, + Type = Runtime.Data.DataKind.R4 + }, + + new TextLoader.Column() + { + Name = "Features", + Source = new [] { new TextLoader.Range() { Min = 1, Max = 4} }, + Type = Runtime.Data.DataKind.R4 + } + } + }, + + InputFile = inputFile + }).Data; + + const int nModels = 5; + var splitOutput = CVSplit.Split(Env, new CVSplit.Input { Data = dataView, NumFolds = nModels + 1 }); + var predictorModels = new IPredictorModel[nModels]; + var individualScores = new IDataView[nModels]; + for (int i = 0; i < nModels; i++) + { + var data = splitOutput.TrainData[i]; + data = new RffTransform(Env, new RffTransform.Arguments() + { + Column = new[] + { + new RffTransform.Column() {Name = "Features1", Source = "Features"}, + new RffTransform.Column() {Name = "Features2", Source = "Features"}, + }, + NewDim = 10, + UseSin = false + }, data); + data = new ConcatTransform(Env, new ConcatTransform.Arguments() + { + Column = new[] { new ConcatTransform.Column() { Name = "Features", Source = new[] { "Features1", "Features2" } } } + }, data); + + var mlr = new 
MulticlassLogisticRegression(Env, new MulticlassLogisticRegression.Arguments()); + RoleMappedData rmd = RoleMappedData.Create(data, + RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Feature, "Features"), + RoleMappedSchema.CreatePair(RoleMappedSchema.ColumnRole.Label, "Label")); + mlr.Train(rmd); + + predictorModels[i] = new PredictorModel(Env, rmd, data, mlr.CreatePredictor()); + var transformModel = new TransformModel(Env, data, splitOutput.TrainData[i]); + + predictorModels[i] = ModelOperations.CombineTwoModels(Env, + new ModelOperations.SimplePredictorModelInput() + { PredictorModel = predictorModels[i], TransformModel = transformModel }).PredictorModel; + + individualScores[i] = + ScoreModel.Score(Env, + new ScoreModel.Input { Data = splitOutput.TestData[nModels], PredictorModel = predictorModels[i] }) + .ScoredData; + } + + var mcEnsembleModel = EnsembleCreator.CreateMultiClassPipelineEnsemble(Env, + new EnsembleCreator.PipelineClassifierInput() + { + ModelCombiner = EnsembleCreator.ClassifierCombiner.Average, + Models = predictorModels + }).PredictorModel; + var mcScored = ScoreModel.Score(Env, + new ScoreModel.Input + { + Data = splitOutput.TestData[nModels], + PredictorModel = mcEnsembleModel + }).ScoredData; + + var modelPath = DeleteOutputPath("SavePipe", "PipelineEnsembleModel.zip"); + using (var file = Env.CreateOutputFile(modelPath)) + using (var strm = file.CreateWriteStream()) + mcEnsembleModel.Save(Env, strm); + + IPredictorModel loadedFromSaved; + using (var file = Env.OpenInputFile(modelPath)) + using (var strm = file.OpenReadStream()) + loadedFromSaved = new PredictorModel(Env, strm); + + var scoredFromSaved = ScoreModel.Score(Env, + new ScoreModel.Input() + { + Data = splitOutput.TestData[nModels], + PredictorModel = loadedFromSaved + }).ScoredData; + + using (var curs = mcScored.GetRowCursor(col => true)) + using (var cursSaved = scoredFromSaved.GetRowCursor(col => true)) + using (var curs0 = individualScores[0].GetRowCursor(col => 
true)) + using (var curs1 = individualScores[1].GetRowCursor(col => true)) + using (var curs2 = individualScores[2].GetRowCursor(col => true)) + using (var curs3 = individualScores[3].GetRowCursor(col => true)) + using (var curs4 = individualScores[4].GetRowCursor(col => true)) + { + var good = curs0.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out int col); + Assert.True(good); + var getter0 = curs0.GetGetter>(col); + good = curs1.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getter1 = curs1.GetGetter>(col); + good = curs2.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getter2 = curs2.GetGetter>(col); + good = curs3.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getter3 = curs3.GetGetter>(col); + good = curs4.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getter4 = curs4.GetGetter>(col); + good = curs.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getter = curs.GetGetter>(col); + good = cursSaved.Schema.TryGetColumnIndex(MetadataUtils.Const.ScoreValueKind.Score, out col); + Assert.True(good); + var getterSaved = cursSaved.GetGetter>(col); + + var c = new MultiAverage(Env, new MultiAverage.Arguments()).GetCombiner(); + VBuffer score = default(VBuffer); + VBuffer[] score0 = new VBuffer[5]; + VBuffer scoreSaved = default(VBuffer); + VBuffer avg = default(VBuffer); + VBuffer dense1 = default(VBuffer); + VBuffer dense2 = default(VBuffer); + while (curs.MoveNext()) + { + getter(ref score); + Assert.True(curs0.MoveNext()); + Assert.True(curs1.MoveNext()); + Assert.True(curs2.MoveNext()); + Assert.True(curs3.MoveNext()); + Assert.True(curs4.MoveNext()); + Assert.True(cursSaved.MoveNext()); + getter0(ref score0[0]); + getter1(ref score0[1]); + getter2(ref score0[2]); + 
getter3(ref score0[3]); + getter4(ref score0[4]); + getterSaved(ref scoreSaved); + Assert.True(CompareVBuffers(ref scoreSaved, ref score, ref dense1, ref dense2)); + c(ref avg, score0, null); + Assert.True(CompareVBuffers(ref avg, ref score, ref dense1, ref dense2)); + } + Assert.False(curs0.MoveNext()); + Assert.False(curs1.MoveNext()); + Assert.False(curs2.MoveNext()); + Assert.False(curs3.MoveNext()); + Assert.False(curs4.MoveNext()); + Assert.False(cursSaved.MoveNext()); + } + } private static bool CompareVBuffers(ref VBuffer v1, ref VBuffer v2, ref VBuffer dense1, ref VBuffer dense2) { @@ -2653,7 +3297,7 @@ public void EntryPointLinearPredictorSummary() NormalizeFeatures = NormalizeOption.Yes, NumThreads = 1, // REVIEW: this depends on MKL library which is not available - ShowTrainingStats = false + ShowTrainingStats = false }; var model = LogisticRegression.TrainBinary(Env, lrInput).PredictorModel; diff --git a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj index c4ccb76ae9..8efa59e99e 100644 --- a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj +++ b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj @@ -4,6 +4,7 @@ + diff --git a/test/data/lm.sample.txt b/test/data/lm.sample.txt new file mode 100644 index 0000000000..4aa28ab03f --- /dev/null +++ b/test/data/lm.sample.txt @@ -0,0 +1,120 @@ +Wirtschaft de-DE url-pathpart-wirtschaft url-pathpart-soziales url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,793077,00 url-pagepartsplitname-0,1518,793077,00 url-lastpartpagename-html#ref=rss weg fuer milliardenhilfe frei vor dem parlamentsgebaeude toben strassenkaempfe zwischen demonstranten drinnen haben die griechischen abgeordneten das drastische sparpaket am abend endgueltig beschlossen die entscheidung ist eine wichtige voraussetzung fuer die auszahlung von weiteren acht milliarden euro hilfsgeldern 
athen das griechische parlament hat einem umfassenden sparpaket endgueltig zugestimmt +Wirtschaft de-DE url-pathpart-finanzen url-pathpart-versicherer url-pathpart-460206 url-domainname-www.wiwo.de url-domainprefix-www url-domainprefix-www.wiwo url-domaintype-de url-domainsuffix-wiwo.de versicherer lediglich air worldwide ein spezialist fuer risikomodelle wagte sich schnell mit einer ersten schaetzung vor +Wirtschaft de-DE url-pathpart-artikel url-domainname-www.neues-deutschland.de url-domainprefix-www url-domainprefix-www.neues-deutschland url-domaintype-de url-domainsuffix-neues-deutschland.de url-firstpartpagename-198684 url-pagepartsplitname-198684 url-lastpartpagename-html neue todesfaelle durch ehec hannover berlin dpa nd in niedersachsen hat der ehec erreger erneut ein todesopfer gefordert eine ueber 70 jahre alte frau aus cuxhaven sei am freitagmorgen in einem bremer krankenhaus an hus dem haemolytisch uraemischen syndrom gestorben sagte ein sprecher des niedersaechsischen gesundheitsministeriums am freitag in hannover die infektion sei bei der frau bereits labordiagnostisch nachgewiesen zudem habe sich der tod einer 41 jaehrigen vom vergangenen mittwoch auf ehec zurueckfuehren lassen die frau stammte ebenfalls aus cuxhaven damit ist die zahl der bestaetigten todesfaelle durch den gefaehrlichen darmkeim ehec in deutschland auf fuenf gestiegen nach angaben des robert koch instituts rki sind von donnerstag auf freitag etwa 60 neue hus faelle gemeldet worden die gesundheitsbehoerden in kopenhagen und stockholm teilten mit in daenemark und schweden seien bisher 32 ehec faelle nachgewiesen worden alle betroffenen seien zuvor in deutschland auf reisen gewesen das aggressive bakterium ehec treibt seit mitte mai in deutschland sein unwesen das hus ist eine schwere verlaufsform bei der giftige stoffwechselprodukte des bakteriums zu nierenschaeden fuehren koennen auf der suche nach einer quelle fuer den erreger waren experten am donnerstag bei gurken aus spanien 
fuendig geworden +Wirtschaft de-DE url-pathpart-artikel url-domainname-www.neues-deutschland.de url-domainprefix-www url-domainprefix-www.neues-deutschland url-domaintype-de url-domainsuffix-neues-deutschland.de url-firstpartpagename-204736 url-pagepartsplitname-204736 url-lastpartpagename-html italiens schattenwirtschaft blueht in rom beginnt in diesen tagen die parlamentarische diskussion ueber das sparpaket das vergangene woche verabschiedet wurde darin werden in erster linie die aermeren zur kasse gebeten ein wichtiger wirtschaftszweig bleibt vollkommen ungeschoren +Wirtschaft de-DE url-pathpart-wirtschaft url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,785498,00 url-pagepartsplitname-0,1518,785498,00 url-lastpartpagename-html#ref=rss krise in griechenland die wirtschaftsaussichten fuer griechenland sind offenbar noch schlimmer als bislang angenommen +Wirtschaft de-DE url-pathpart-artikel url-domainname-www.neues-deutschland.de url-domainprefix-www url-domainprefix-www.neues-deutschland url-domaintype-de url-domainsuffix-neues-deutschland.de url-firstpartpagename-206132 url-pagepartsplitname-206132 url-lastpartpagename-html auf wackligen fuessen die sogenannte laenderklausel im gesetzentwurf zur abscheidung und speicherung von co2 bietet nach einem gutachten keine ausreichende rechtssicherheit auf der zielgeraden des bereits im bundestag verabschiedeten gesetzentwurfes zur speicherung und lagerung von kohlendioxid co2 haben sich am dienstag in kiel noch einmal die umweltverbaende von greenpeace und bund zu wort gemeldet und die sogenannte laenderklausel als ein womoeglich unsicheres mittel zur verhinderung der co2 verpressung bezeichnet diese biete juristische angriffsflaechen zudem gebe es ausserhalb der zwoelf seemeilen umfassenden aussenwirtschaftszone in der nordsee ohnehin keinen zugriff fuer eine laenderhoheit beide organisationen stellten ein 
entsprechendes rechtsgutachten der anwaeltin roda verheyen aus hamburg vor in dessen schlussfolgerung steht die forderung speziell an die beiden schwarz gelben landesregierungen von schleswig holstein und niedersachsen am 23 september im bundesrat gegen das ausgearbeitete ccs carbon capture and storage gesetz zu stimmen weil es den einzelnen laendern nur eine truegerische sicherheit biete dass es nicht doch zur erprobung oder anwendung der speichertechnologie komme greenpeace energieexpertin anike peters spricht in diesem zusammenhang von einem faulen deal die bund klimaexpertin tina loeffelsend nennt es eine beruhigungspille die der bevoelkerung verabreicht werde bereits 2017 erfahre das gesetz eine evaluierung und damit wahrscheinlich eine novellierung prophezeit die anwaeltin verheyen tamara zieschang staatssekretaerin aus dem schleswig holsteinischen wirtschaftsministerium hatte kuerzlich auf einer versammlung der buergerinitiative gegen co2 speicherung in nordfriesland bekraeftigt dass es mit der laenderklausel gelingen werde die im hohen norden auf breite ablehnung stossenden plaene zu verhindern insbesondere sorgen sich landwirte und trinkwasserverbaende die sich von vornherein nicht auf moegliche gefahren einlassen wollen und mit den zwischen und endlagerunzulaenglichkeiten rund um den atommuell in der asse und gorleben genuegend anschauungsunterricht geliefert bekommen abgesehen von der sicherheitsphilosophie steht das gesetz und hier besonders die laenderklausel aus sicht der ccs gegner auch rechtlich auf wackeligen fuessen es handele sich geradezu um eine arbeitsbeschaffungsmassnahme fuer juristen so verheyen die sich nicht vorstellen kann dass energiekonzerne sich gaenzlich von ihren ccs vorhaben abbringen lassen erst recht nicht vor dem hintergrund von 21 geplanten oder noch in bau befindlichen kohlekraftwerken und der wende in der energiepolitik dass bundesrechtliche kompetenz in solch einem politikfeld nun ausgerechnet auf die laenderebene 
heruntergereicht werde stelle ein novum dar betont verheyen eine gesetzliche verankerung des sankt florians prinzips und damit eine ungleichgewichtung einer lastenteilung sei unter umstaenden verfassungsrechtlich angreifbar fuehrt verheyen aus da mit der laenderklausel weder co2 pipelines und transportwege noch die abscheidung selbst verhindert werden sei die vermeintliche teilblockade die womoeglich gerichtlich wieder kassiert werden koenne irrefuehrend klueger sei zur vorgeschriebenen umsetzung der eu richtlinie ein verbotsgesetz wie es oesterreich bereits verabschiedet hat versichert loeffelsend um das zu erreichen kuendigte schleswig holsteins bund landesvorsitzende sybille macht baumgarten an der oertliche protest in form von montagsdemonstrationen werde nicht nachlassen ccs carbon capture and storage steht fuer die technologie der co2 abscheidung und speicherung im juli hatte der bundestag einen gesetzentwurf verabschiedet der diese technologie zu forschungszwecken erlaubt das gesetz geht auf eine eu richtlinie zurueck +Wirtschaft de-DE url-pathpart-finanzen url-pathpart-strengerer url-pathpart-stresstest url-pathpart-fuer url-pathpart-banken url-pathpart-459120 url-domainname-www.wiwo.de url-domainprefix-www url-domainprefix-www.wiwo url-domaintype-de url-domainsuffix-wiwo.de strengerer stresstest fuer banken die neue eu bankenaufsicht eba will die finanzinstitute strengeren stresstests unterwerfen die kriterien des verschaerften verfahrens sind bislang offen doch immerhin steht jetzt der zeitplan fest fuer den 11 maerz hat die bundesbank die wichtigen deutschen banken vorgeladen um ueber die details des wieder bevorstehenden stresstests zu sprechen das erwaehnte wolfgang kirsch vorstandsvorsitzender der genossenschaftlich organisierten dz bank anlaesslich der bilanzpressekonferenz seines hauses in frankfurt kirsch vermutet dass die neue europaeische bankenaufsichtsbehoerde eba die anforderungen an die institute hochschrauben koennte laut eba chef andrea 
enria sieht es tatsaechlich danach aus wir muessen aus der vergangenheit lernen und zu einer strengeren und serioeseren ueberpruefung kommen sagte enria der nachrichtenagentur reuters mit der vergangenheit spielt der bankenaufseher auf den ersten europaeischen stresstest an der im vergangenen jahr stattfand damals waren nur sieben der 91 getesteten institute durchgefallen darunter die verstaatlichte deutsche immobilienbank hypo real estate irische banken dagegen die mit irland den zweiten eu mitgliedstaat nach griechenland in die krise gestuerzt hatten bestanden den test dieses manko will die seit januar 2011 fuer die europaeische bankenaufsicht zustaendige eba beim nun bevorstehenden zweiten testlauf beseitigen aufgabe der eba ist in kooperation mit nationalen kontrollbehoerden wie der bundesbank oder der bundesanstalt fuer finanzdienstleistungsaufsicht bafin fuer stabile finanzmaerkte zu sorgen die eba will ausserdem die finanzmarktregeln in der europaeischen union harmonisieren um gleiche bedingungen fuer alle europaeischen institute zu schaffen wie bereits im vergangenen jahr simulieren die kontrolleure im rahmen des stresstests ein wirtschaftliches negativszenario das die europaeische zentralbank entwickelt hat die simulation soll zeigen ob europas wichtigste banken fit genug sind um finanzschocks wie fallende immobilienpreise oder explodierende zinsen fuer staatsanleihen zu ueberstehen ziel ist zu beurteilen wie krisenresistent der finanzsektor ist erst am 18 maerz wird bekannt gegeben wie viele und welche finanzinstitute ueberprueft werden auch will die eba erst im april bekannt geben wie die testbedingungen genau aussehen doch schon morgen wollen die aufseher laut enria diese informationen an die fuer den stresstest ausgewaehlten banken weiterleiten das soll den betroffenen instituten die moeglichkeit fuer ein kritisches feedback geben im juni will die eba das ergebnis des stresstests bekannt geben strittig ist in welcher form die oeffentlichkeit und die 
spieler an den finanzmaerkten darueber zu informieren sind wie einzelne institute abgeschnitten haben der genossenschaftsbanker wolfgang kirsch bevorzugt eine regulierung im stillen denn schlechte testergebnisse duerften zur folge haben dass bedrohte banken keine privaten kapitalgeber mehr finden +Wirtschaft de-DE url-pathpart-artikel url-domainname-www.neues-deutschland.de url-domainprefix-www url-domainprefix-www.neues-deutschland url-domaintype-de url-domainsuffix-neues-deutschland.de url-firstpartpagename-207623 url-pagepartsplitname-207623 url-lastpartpagename-html boeings albtraum ist ausgeliefert irgendwie ist man geneigt die maschine mit einem trabant zu vergleichen erstens weil das ddr auto auch zum gutteil aus verbundwerkstoffen bestand und daher sehr leicht war zweitens weil kunden ewig und drei tage auf die auslieferung ihres traums warten mussten doch nun wird der erste der 55 von ana bestellten 787 langstreckenjets richtung japan abheben mit rund dreieinhalbjaehriger verspaetung im november wird die 787 zunaechst auf ana inlandslinien eingesetzt im dezember folgt dann die verbindung tokio peking bevor ana dann ab januar mit der 787 zwischen tokio haneda und frankfurt am main pendelt insgesamt sind derzeit 820 dreamliner deren rumpf und tragflaechen zum grossteil aus kohlefaserwerkstoff bestehen in boeings orderbuechern vermerkt das ist ein stattlicher auftrag die airlines erhoffen sich einen effektivitaetsschub von der maschine dass der typ so spaet kommt liegt unter anderem daran dass die produktion zum grossen teil ins ausland verlagert wurde gut ein drittel der bauteile des modells kommt aus japan von den zulieferfirmen mitsubishi kawasaki und juji heavy industries das boeing management war nicht in der lage die herstellung zu koordinieren und dabei technische schwierigkeiten die sich bei tests herausgestellt haben zu beheben die ersten drei exemplare musste der hersteller wegen unverkaeuflichkeit abschreiben so wurden aus veranschlagten sechs 
milliarden dollar entwicklungskosten rund 15 milliarden dollar und der dreamliner zum teuersten zivilflugzeugprogramm aller zeiten also zum albtraum der konzernbosse boeing dementiert nicht dass man erst um die tausend jets verkaufen muss bevor man in die gewinnzone fliegen kann auch bei der modernisierung ihres jumbos 747 hat boeing bislang nicht viel glueck die us frachtflug gesellschaft air atlas hat drei ihrer urspruenglich zwoelf georderten maschinen abbestellt als gruende werden mangelnde termintreue sowie leistungserwaegungen geltend gemacht unlaengst hatte die cargolux gesellschaft boeing duepiert weil sie die uebergabezeremonie platzen liess wegen technischer probleme hatte sich die erstauslieferung des neuen jumbos immer wieder verschoben man liegt rund zwei jahre hinter dem urspruenglichen zeitplan zurueck was die zuverlaessigkeit des 787 traum typs betrifft so kann lufthansa einiges dazu beitragen der technik betrieb der deutsche airline uebernimmt fuer einen weiteren japanischen 787 kunden die japan airlines wartung und service +Gesundheit de-DE url-pathpart-c url-pathpart-32201 url-pathpart-f url-pathpart-423986 url-pathpart-s url-pathpart-144fc263 url-pathpart-l url-pathpart-0L0Saerztezeitung0Bde0Cmedizin0Ckrankheiten0Cdiabetes0Carticle0C650A7360Ctyp0E20Ediabetes0Easthma0Erisiko0Everdoppelt0Bhtml url-domainname-rss.feedsportal.com url-domainprefix-rss url-domainprefix-rss.feedsportal url-domaintype-com url-domainsuffix-feedsportal.com url-firstpartpagename-story01 url-pagepartsplitname-story01 url-lastpartpagename-htm bei typ 2 diabetes ist das asthma risiko verdoppelt neu isenburg ikr es lohnt sich offenbar nicht nur bei asthmatikern verstaerkt auf hinweise fuer diabetes zu achten umgekehrt ist es auch wichtig bei typ 2 diabetikern das erhoehte asthmarisiko im blick zu haben denn dieses ist aktuellen studiendaten zufolge bei typ 2 diabetikern praktisch verdoppelt erst vor kurzem hat eine us amerikanische studie mit rund 2400 asthma patienten und 4 
800 kontrollpersonen ohne die chronisch obstruktive atemwegserkrankung ergeben +Gesundheit de-DE url-pathpart-c url-pathpart-32191 url-pathpart-f url-pathpart-443314 url-pathpart-s url-pathpart-144f4210 url-pathpart-l url-pathpart-0L0Sfocus0Bde0Cgesundheit0Cratgeber0Csexualitaet0Cnews0Cattraktivitaet0Ewie0Emaenner0Efrauen0Eum0Eden0Efinger0Ewickeln0Iaid0I6198990Bhtml url-domainname-rss2.focus.de url-domainprefix-rss2 url-domainprefix-rss2.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-story01 url-pagepartsplitname-story01 url-lastpartpagename-htm attraktivitaet wie maenner frauen um den finger wickeln das laengenverhaeltnis von zeige und ringfinger eines mannes ist aussagekraeftig +Gesundheit de-DE url-pathpart-c url-pathpart-32201 url-pathpart-f url-pathpart-423986 url-pathpart-s url-pathpart-144c9d77 url-pathpart-l url-pathpart-0L0Saerztezeitung0Bde0Cpraxis0Iwirtschaft0Cpersonalfuehurng0Carticle0C650A740A0Cdeutsche0Ekein0Eproblem0Eteilzeit0Echefinnen0Bhtml url-domainname-rss.feedsportal.com url-domainprefix-rss url-domainprefix-rss.feedsportal url-domaintype-com url-domainsuffix-feedsportal.com url-firstpartpagename-story01 url-pagepartsplitname-story01 url-lastpartpagename-htm deutsche haben kein problem mit teilzeit chefinnen muenchen reh gut drei viertel der deutschen koennen sich vorstellen unter einer vorgesetzten mit teilzeitvertrag zu arbeiten das zeigt eine repraesentative online umfrage unter 1000 buergern der unternehmensberatung rochus mummert die angst vieler unternehmen dass teilzeit chefinnen autoritaetsprobleme drohten sei also unbegruendet so das beratungsunternehmen noch sei aber nicht einmal jede vierte fuehrungsposition im mittleren management mit einer frau besetzt dabei haetten frauen als leitende teilzeitangestellte die chance nicht nur beruf und familie sondern auch karriere und familie miteinander zu verbinden das fachliche und persoenliche wissen eine fuehrungsposition zu uebernehmen trauen die bundesbuerger den 
frauen laut der umfrage allemal zu fuer zwei von drei befragten macht es keinen unterschied ob sie von einer frau oder einem mann gefuehrt werden unternehmen die fuer qualifizierte frauen als arbeitgeber attraktiv sein moechten sollten sich beeilen fuehrungskultur und fuehrungsprozesse auf teilzeitmodelle auszurichten sagt bernhard walter senior berater bei rochus mummert dabei geht es nicht um die erfuellung wie und von wem auch immer berechneter frauenquoten sondern um das ureigene interesse der firmen im kampf um die besten koepfe auch auf alle koepfe zugreifen zu koennen die definition von arbeitgeberattraktivitaet hoert in diesem zusammenhang eben nicht beim betriebskindergarten auf sondern umfasst genauso das aufzeigen klarer karriereperspektiven +Gesundheit de-DE url-pathpart-news url-domainname-www.n24.de url-domainprefix-www url-domainprefix-www.n24 url-domaintype-de url-domainsuffix-n24.de url-firstpartpagename-newsitem_7180047 url-pagepartsplitname-newsitem url-pagepartsplitname-7180047 url-lastpartpagename-html fernsehen als einschlafhilfe wer unter schlafstoerungen leidet laesst kaum etwas unversucht um abends besser zur ruhe zu kommen wie eine repraesentative umfrage im auftrag des gesundheitsmagazins apotheken umschau herausfand ist die gaengigste methode der betroffenen das schlichte abwarten +Gesundheit de-DE url-pathpart-politik url-pathpart-gesellschaft url-pathpart-berufspolitik url-pathpart-article url-pathpart-672701 url-domainname-www.aerztezeitung.de url-domainprefix-www url-domainprefix-www.aerztezeitung url-domaintype-de url-domainsuffix-aerztezeitung.de url-firstpartpagename-kbv-chef-koehler-fuerchtet-blockade-spezialaerztlichen-versorgung url-pagepartsplitname-kbv url-pagepartsplitname-chef url-pagepartsplitname-koehler url-pagepartsplitname-fuerchtet url-pagepartsplitname-blockade url-pagepartsplitname-spezialaerztlichen url-pagepartsplitname-versorgung url-lastpartpagename-html kbv chef koehler fuerchtet blockade der spezialaerztlichen 
versorgung das versorgungsgesetz eroeffnet kven viele handlungsmoeglichkeiten eindringlich mahnt kbv chef dr andreas koehler jetzt zu liefern sein vorstandskollege dr carl heinz mueller fordert erneut die abschaffung der richtgroessen regresse berlin im vorfeld der parlamentarischen beratungen des gkv versorgungsstrukturgesetzes hat kbv chef andreas koehler am freitag bei der internen vertreterversammlung seiner organisation die kven ermahnt die chancen dieses gesetzes zu nutzen diese regierung beschert uns ein liberales gesetz es schafft neue instrumente aber wir muessen sie auch nutzen wollen sagte koehler bereits fuer oktober kuendigte er ein neues gesamtkonzept fuer die bedarfsplanung an hier liegt der ball klar im feld der aerzteschaft die freiheiten die die kven bei der bedarfsplanung erhalten bedeuteten auch gewohnheiten auf den pruefstand zu stellen die ermittlung von verhaeltniszahlen je landkreis oder kreisfreier stadt auf der grundlage der bedarfsverteilung von 1990 sei nicht mehr zeitgemaess je nach fachgruppe muessten differenzierte bedarfsplanungsregionen jenseits von land und stadtkreisen gefunden werden entschieden werden muesse die frage ob auch die ausschliesslich auftragnehmenden fachaerzte in die bedarfsplanung einbezogen werden sollen ein besonderes problem fuer die bedarfsplanung koennten die psychotherapeuten werden angesichts der demografischen entwicklung muesse die zahl der notwendigen sitze dramatisch erhoeht werden das werde aber nur funktionieren wenn die psychotherapie ausserhalb der morbiditaetsbedingten gesamtverguetung finanziert werde diese fachgruppe sei ein gutes beispiel dafuer dass eine am tatsaechlichen versorgungsbedarf orientierte planung sinnvoll sei um wartezeiten auf einen therapieplatz zu verkuerzen koehler sorgt sich um realisierung der spezialaerztlichen versorgung mit sorge sieht koehler dass aufgrund der intervention der laender die spezialaerztliche versorgung nicht realisiert wird zwar seien erhebliche korrekturen 
am entwurf von paragraf 116 b noetig +Gesundheit de-DE url-pathpart-medizin url-pathpart-krankheiten url-pathpart-demenz url-pathpart-article url-pathpart-673819 url-domainname-www.aerztezeitung.de url-domainprefix-www url-domainprefix-www.aerztezeitung url-domaintype-de url-domainsuffix-aerztezeitung.de url-firstpartpagename-wenn-menschen-teuren-anzug-stundenlang-duschen url-pagepartsplitname-wenn url-pagepartsplitname-menschen url-pagepartsplitname-teuren url-pagepartsplitname-anzug url-pagepartsplitname-stundenlang url-pagepartsplitname-duschen url-lastpartpagename-html wenn menschen im teuren anzug stundenlang duschen ein mann in anzug unter der dusche +Gesundheit de-DE url-pathpart-politik url-pathpart-gesellschaft url-pathpart-article url-pathpart-673507 url-domainname-www.aerztezeitung.de url-domainprefix-www url-domainprefix-www.aerztezeitung url-domaintype-de url-domainsuffix-aerztezeitung.de url-firstpartpagename-spd-setzt-haeusliche-pflege-rund-uhr url-pagepartsplitname-spd url-pagepartsplitname-setzt url-pagepartsplitname-haeusliche url-pagepartsplitname-pflege url-pagepartsplitname-rund url-pagepartsplitname-uhr url-lastpartpagename-html spd setzt auf haeusliche pflege rund um die uhr die spd bundestagsfraktion hat eine pflege wunschliste erarbeitet +Gesundheit de-DE url-pathpart-medizin url-pathpart-krankheiten url-pathpart-neuro url-pathpart-psychiatrische url-pathpart-krankheiten url-pathpart-schlafstoerungen url-pathpart-article url-pathpart-672538 url-domainname-www.aerztezeitung.de url-domainprefix-www url-domainprefix-www.aerztezeitung url-domaintype-de url-domainsuffix-aerztezeitung.de url-firstpartpagename-dopamin-agonisten-bringen-quaelend-rastlose-beine-ruhe url-pagepartsplitname-dopamin url-pagepartsplitname-agonisten url-pagepartsplitname-bringen url-pagepartsplitname-quaelend url-pagepartsplitname-rastlose url-pagepartsplitname-beine url-pagepartsplitname-ruhe url-lastpartpagename-html dopamin agonisten bringen quaelend rastlose beine zur 
ruhe das restless legs syndrom rls laesst sich mit dopaminergen arzneien inzwischen gut behandeln auch eine augmentation ist weitgehend vermeidbar allerdings wird es immer noch zu selten erkannt wiesbaden es zieht kribbelt und brennt in den beinen man findet kaum noch schlaf +Deutschland de-DE url-pathpart-c url-pathpart-33356 url-pathpart-f url-pathpart-566375 url-pathpart-s url-pathpart-1782004b url-pathpart-l url-pathpart-0L0Stagesspiegel0Bde0Cberlin0Cpolizei0Ejustiz0Ckinderwagen0Ebrandstiftungen0Everdaechtiger0Efestgenommen0C451520A80Bhtml url-domainname-tagesspiegel.feedsportal.com url-domainprefix-tagesspiegel url-domainprefix-tagesspiegel.feedsportal url-domaintype-com url-domainsuffix-feedsportal.com url-firstpartpagename-story01 url-pagepartsplitname-story01 url-lastpartpagename-htm kinderwagen brandstiftungen update in prenzlauer berg hat die polizei am fruehen freitagmorgen einen 29 jaehrigen festgenommen er soll in einem wohnhaus in der winsstrasse zwei kinderwagen in brand gesteckt haben wenn sie diese felder durch einen klick aktivieren werden informationen an facebook twitter oder google in die usa uebertragen und unter umstaenden auch dort gespeichert bei dem festgenommenen handele es sich um einen zeitungsaustraeger aus neukoelln teilte die polizei mit der 29 jaehrige hatte bereits die aufmerksamkeit der ermittler auf sich gezogen und wurde deshalb in der vergangenen nacht beobachtet nachdem er seine zeitungslieferung entgegengenommen hatte laut polizeiangaben folgten ihm zivilpolizisten waehrend er seine route durch den kiez absolvierte jeder hausflur wurde von den zivilpolizisten kontrolliert in einem haus in der winsstrasse hielt sich der beobachtete laengere zeit auf nachdem er das gebaeude wieder verlassen hatte und in die heinrich roller strasse abgebogen war entdeckten die beamten die brennenden kinderwagen informierten die kollegen und versuchten mit feuerloeschern den brand zu ersticken in der heinrich roller strasse klickten dann kurz 
darauf die handschellen der festgenommene hatte ein einwegfeuerzeug dabei er wurde nach einer erkennungsdienstlichen behandlung den ermittlern des zustaendigen brandkommissariates des landeskriminalamtes ueberstellt inwieweit der mann auch fuer brandlegungen an kinderwagen in der vergangenheit in frage kommt ist gegenstand der laufenden ermittlungen am nachmittag lief noch die vernehmung des mannes durch die ermittler vom lka spaeter soll entschieden werden ob ein haftbefehl beantragt wird fuer wie viele braende der 29 jaehrige insgesamt im verdacht steht wollte die polizei am freitag noch nicht sagen als zeitungszusteller hatte der mann zu dutzenden haeusern im bezirk zugang in der gegend hatte es immer wieder trotz verschlossener tueren gebrannt wie die ermittler auf den verdaechtigen aufmerksam wurden wollten sie aus ermittlungstaktischen gruenden nicht verraten sicher ist nur dass es die erste nacht war in der zivilpolizisten den verdaechtigen auf schritt und tritt ueberwacht und auch gleich ertappt haben haeufig verraten brandstifter sich selbst indem sie am tatort bleiben und vorgeben beim loeschen zu helfen oder sie kehren zurueck um sich ihr werk anzusehen die polizei ist daher speziell geschult auf verdaechtige schaulustige und besonders engagierte helfer zu achten inwieweit der mann auch fuer brandlegungen an kinderwagen in der vergangenheit in frage kommt ist gegenstand der laufenden ermittlungen in berlin kommt es seit monaten regelmaessig zu braenden in hausfluren nach polizeiangaben der polizei gab es zwischen januar und mai bereits 163 braende in hausfluren bei denen zum teil mieter rauchvergiftungen erlitten und hoher sachschaden entstand im vergleichszeitraum des vorjahres waren es insgesamt 144 brandstiftungen wenn sie diese felder durch einen klick aktivieren werden informationen an facebook twitter oder google in die usa uebertragen und unter umstaenden auch dort gespeichert +Deutschland de-DE url-pathpart-c url-pathpart-33356 url-pathpart-f 
url-pathpart-566375 url-pathpart-s url-pathpart-175ec8be url-pathpart-l url-pathpart-0L0Stagesspiegel0Bde0Cberlin0Cpolizei0Ejustiz0C580Ejaehriger0Edurch0Esprengsatz0Ein0Epark0Everletzt0C4498170A0Bhtml url-domainname-tagesspiegel.feedsportal.com url-domainprefix-tagesspiegel url-domainprefix-tagesspiegel.feedsportal url-domaintype-com url-domainsuffix-feedsportal.com url-firstpartpagename-story01 url-pagepartsplitname-story01 url-lastpartpagename-htm 58 jaehriger durch sprengsatz in park verletzt update bei der explosion eines sprengsatzes in einem park im wedding ist am sonntag ein 58 jaehriger verletzt worden im wedding wurden schon zuvor rohrbomben gefunden wenn sie diese felder durch einen klick aktivieren werden informationen an facebook twitter oder google in die usa uebertragen und unter umstaenden auch dort gespeichert ein spaziergaenger ist am sonntagnachmittag in wedding von einer bombe schwer verletzt worden die in einer plastiktuete versteckt war der 58 jaehrige aus dem gleichen bezirk hatte seinen hund im schillerpark nahe der edinburger strasse ausgefuehrt als er eine neben einer parkbank herumliegende plastiktuete aufheben wollte kam es zur explosion der mann erlitt schwere verletzungen im gesicht und an den beinen ersten erkenntnissen zufolge handelte es sich bei dem sprengsatz um eine rohrbombe ein aehnlicher fall bei dem aber niemand verletzt wurde ereignete sich bereits im mai dieses jahres gleichfalls in wedding die polizei sperrte den schillerpark ueber stunden komplett ab und durchsuchte ihn dabei wurde jedoch nichts gefaehrliches mehr gefunden erschwert war die spurensicherung durch starke regenfaelle der schwerverletzte wurde von einem notarztwagen ins weddinger virchowklinikum gebracht der mann schwebt nicht in lebensgefahr bei der such nach dem bombenbauer tappen die ermittler weiterhin im dunkeln hiess es am montag von einem polizeisprecher die polizei warnte am sonntagabend davor verdaechtige gegenstaende vor allem in tueten aufzuheben im 
verdachtsfalle solle die polizei alarmiert werden am sonntagabend suchten sprengstoffspuerhunde im schillerpark nach gegenstaenden die moeglicherweise mit in der plastiktuete lagen und durch die explosion weggeschleudert worden waren der fuer sprengstoffdelikte zustaendige staatsschutz bei der polizei hat die ermittlungen uebernommen die experten sehen einen zusammenhang mit einem aehnlichen vorfall im vergangenen mai am 26 mai war ebenfalls in wedding eine rohrbombe in einer plastiktuete gefunden worden eine passantin hatte den verdaechtigen und mit kabeln versehenen gegenstand an der boeschung des nordufers entdeckt und die polizei gerufen die beamten konnten die bombe damals entschaerfen am tag danach hatte das praesidium mitgeteilt dass die eigenkonstruktion sprengfaehig gewesen sei die explosionswirkung habe jedoch nur fuer die unmittelbare umgebung ausgereicht also fuer denjenigen der die selbstbaubombe hochhebt fuer die auf der anderen seite des spandauer schifffahrtskanals gelegene auslaenderbehoerde habe keine gefahr bestanden hiess es die in der linken szene umstrittene behoerde war damals als moegliches ziel genannt worden spaetestens seit dem anschlag im schillerpark ist zu vermuten dass die rohrbomben von einem taeter wahllos abgelegt werden denn in oder am schillerpark ist laut polizei kein politisches motiv zu erkennen offensichtlich hat der bombenbastler sogar seine faehigkeiten weiterentwickelt und perfektioniert auch in den jahren davor hatte es bereits mehrere rohrbombenalarme in wedding gegeben im juni 2007 und juni 2008 waren jeweils in gruenanlagen am dohnagestell reste von rohrbomben gefunden worden diese waren bereits zur explosion gebracht worden durch wen und wieso konnte die polizei nie ermitteln auch diese beiden fundorte liegen ganz in der naehe von jenen in diesem jahr ermittler des lka pruefen nun ob es sich moeglicherweise in all diesen faellen um den gleichen taeter handelt erleichtert wird dies weil die nicht detonierte bastelbombe 
vom nordufer quasi als vergleichsstueck dienen kann im november 2002 hatten zwei lehrlinge eine selbstgebaute rohrbombe auf einen bvg bus geworfen niemand wurde verletzt bei ihrer festnahme gestanden sie den sprengsatz in ihrer ausbildungswerkstatt hergestellt zu haben ein zusammenhang mit den jetzigen faellen gilt aber als unwahrscheinlich die beiden lehrlinge erhielten bewaehrungsstrafen von sechs und acht monaten wenn sie diese felder durch einen klick aktivieren werden informationen an facebook twitter oder google in die usa uebertragen und unter umstaenden auch dort gespeichert +Ausland de-DE url-pathpart-politik url-pathpart-ausland url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,780177,00 url-pagepartsplitname-0,1518,780177,00 url-lastpartpagename-html#ref=rss militaer chinas erster flugzeugtraeger hat seinen heimathafen dalian im nordosten des landes wieder erreicht nach fuenf tagen hat er die testfahrt im pazifik abschlossen der betrieb des in der ukraine gekauften 300 meter langen schiffs beunruhigt die nachbarlaender und die usa peking der erste chinesische flugzeugtraeger hat am sonntag eine fuenftaegige testfahrt abgeschlossen wie die amtliche nachrichtenagentur xinhua berichtete kehrte die warjag in die hafenstadt dalian im nordosten des landes zurueck ein kleiner teil des seegebiets vor dalian war bis zum abend fuer andere schiffe gesperrt die regierung in peking hatte 1998 der ukraine das stillgelegte schiff ohne motoren sowie waffen und navigationssysteme abgekauft und fuer eine generalueberholung in den hafen von dalian geschleppt seither wurde es umgeruestet nach chinesischen angaben soll der flugzeugtraeger forschungs und uebungszwecken dienen nachbarlaender beobachteten die testfahrt jedoch angesichts chinesischer seemachtsambitionen mit sorge +Ausland de-DE url-pathpart-US url-pathpart-Schuldenkrise url-pathpart-!75381 
url-domainname-www.taz.de url-domainprefix-www url-domainprefix-www.taz url-domaintype-de url-domainsuffix-taz.de us schuldenkrise die republikaner verschieben eine abstimmung und die linke protestiert doch ueber obama sprechen nur wenige seine art zu verhandeln missfaellt vielen von dorothea hahn hoert auf die weisheit des heiligen wortes ruft welton gaddy am donnerstagmittag in eine runde aus linken demokratinnen gewerkschafterinnen und frauengruppen sie haben sich direkt vor dem kapitol in washington versammelt um im allerletzten moment zu verhindern dass die sozialversicherung und die beiden staatlichen krankenversicherungen fuer beduerftige gekuerzt werden auf ihren transparenten steht +Unterhaltung de-DE url-pathpart-vermischtes url-pathpart-article1797547 url-domainname-www.morgenpost.de url-domainprefix-www url-domainprefix-www.morgenpost url-domaintype-de url-domainsuffix-morgenpost.de url-firstpartpagename-Kerner-will-nicht-mehr-Kerner-machen url-pagepartsplitname-Kerner url-pagepartsplitname-will url-pagepartsplitname-nicht url-pagepartsplitname-mehr url-pagepartsplitname-Kerner url-pagepartsplitname-machen url-lastpartpagename-html sat 1 zuerst hat er getraeumt dann hat er gebangt schliesslich wieder gehofft doch nun hat johannes b kerner seinen mitarbeitern am dienstag verkuendet dass er zum jahresende sein magazin kerner bei sat 1 beenden werde ja es stimmt johannes b kerner wird am 15 dezember in form eines jahresrueckblicks zum letzten mal sein magazin praesentieren sagte eine sprecherin des muenchner privatsenders mit grossen erwartungen war der damals 44 jaehrige hoffnungstraeger der frueher schon bei sat 1 aktiv war im jahr 2009 vom zdf zu seinem alten sender gewechselt mit ihm wuerden auch die vielen stamm zuschauer die vorher sein mehrfach die woche ausgestrahltes magazin im zdf gesehen hatten abwandern hatte der gebuertige bonner insgeheim gedacht doch die spekulationen erfuellten sich nicht der zdf zuschauer wechselt nicht einfach zu sat 1 
auch nicht wegen kerner nach seinem start am montagabend fuhr die sendung quoten im einstelligen marktanteilsbereich ein und das beim fuer sat 1 so wichtigen publikum zwischen 14 und 49 jahren zu wenig fuer einen kommerziell arbeitenden programmveranstalter so bekam kerners magazin am donnerstag eine neue chance denn der moderator war ein zu wertvolles gesamtgut fuer den sender um ihn sofort aus dem programm zu verbannen als tv produzent und fussballexperte mit vielen kontakten in die sportbranche spielt er im fernsehen eine gewichtige rolle daran aenderte auch der geschaeftsfuehrerwechsel bei sat 1 nichts guido bolten der kerner 2009 noch verpflichtet hatte wurde anfang 2010 von andreas bartl abgeloest auf dem neuen sendeplatz donnerstagabends meist um 22 15 uhr erholte sich die sendung allmaehlich die marktanteile stiessen in den unteren zweistelligen marktanteilsbereich doch vor wenigen wochen verkuendete der moderator im branchendienst dwdl de vorsichtshalber schon mal +Unterhaltung de-DE url-pathpart-aktuelles url-pathpart-stars url-portnumber-80 url-domainname-www.tz-online.de url-domainprefix-www url-domainprefix-www.tz-online url-domaintype-de url-domainsuffix-tz-online.de url-hasarguments- url-firstpartpagename-aura-dione-nackt-neuem-albumcover-1432280 url-pagepartsplitname-aura url-pagepartsplitname-dione url-pagepartsplitname-nackt url-pagepartsplitname-neuem url-pagepartsplitname-albumcover url-pagepartsplitname-1432280 url-lastpartpagename-html url-argname-cmp url-argvalue-defrss url-argvaluesplit-defrss aura dione nuernberg so hatte sich monika gruber ihren ersten auftritt bei wetten dass sicher nicht vorgestellt die kabarettistin wurde bei der samstagabend show opfer einer technsichen panne das war passiert +Unterhaltung de-DE url-pathpart-kultur url-pathpart-article1777949 url-domainname-www.morgenpost.de url-domainprefix-www url-domainprefix-www.morgenpost url-domaintype-de url-domainsuffix-morgenpost.de 
url-firstpartpagename-Roman-Polanski-fuer-Lebenswerk-geehrt url-pagepartsplitname-Roman url-pagepartsplitname-Polanski url-pagepartsplitname-fuer url-pagepartsplitname-Lebenswerk url-pagepartsplitname-geehrt url-lastpartpagename-html filmfestival kult regisseur roman polanski hat beim zurich film festival den preis fuer sein lebenswerk entgegengenommen zwei jahre spaeter als geplant zwei jahre spaeter als geplant hat der polnisch franzoesische starregisseur roman polanski am dienstagabend in zuerich den preis fuer sein lebenswerk entgegengenommen die auszeichnung war ihm vom zurich film festival bereits 2009 zugesprochen worden doch als er vor zwei jahren am 26 september in die schweiz einreisen wollte wurde er in zuerich verhaftet grund war ein internationaler haftbefehl der usa diese forderten die auslieferung polanskis weil der regisseur 1977 eine minderjaehrige missbraucht haben soll polanski wurde in seinem chalet in gstaad unter hausarrest gestellt im juli 2010 wurde er freigelassen die schweiz lehnte seine auslieferung ab die behoerden hatten zweifel an der darstellung des sachverhalts durch die usa die us behoerden legen dem polnisch franzoesischen oscarpreistraeger zur last 1977 eine 13 jaehrige vergewaltigt zu haben polanski hatte seinerzeit eingeraeumt das maedchen mit champagner und drogen verfuehrt zu haben als ihm aber seiner ansicht nach absprachewidrig eine laengere haftstrafe drohte floh er 1978 aus den usa und kehrte seither nie wieder zurueck mit zwei jahren verspaetung sollte der 78 jaehrige nun am dienstagabend das goldene auge des zuercher filmfestivals entgegennehmen besser spaet als nie sagte er in einem interview mit dem franzoesischsprachigen schweizer fernsehen er sei nach wie vor sehr gerne in der schweiz die festivalleitung hatte sich ueber die zusage polanskis sehr gefreut wir sind sehr stolz und geehrt roman polanski nun endlich in zuerich empfangen zu koennen teilte sie mit auch den oscar mit dem er 2003 fuer der pianist als bester 
regisseur ausgezeichnet wurde wagte er aus angst vor verhaftung nicht selbst entgegenzunehmen als sein film der ghostwriter auf der berlinale 2010 uraufgefuehrt wurde sass er noch in gstaad im hausarrest zur premiere von der gott des gemetzels in venedig reiste er nicht an an seinem wohnsitz paris war er als franzoesischer staatsbuerger vor einer auslieferung sicher polanski schuf bis zu seinem 35 lebensjahr bereits vier filme die ihm allesamt hoechste internationale anerkennung einbrachten und klassiker der leinwand sind danach ueberwand er berufliche wie auch private tiefschlaege und errang 2003 mit dem oscar fuer das holocaust drama der pianist seinen groessten triumph es war der spaete doch laengst ueberfaellige hoehepunkt einer karriere die polanski einen herausragenden platz in der filmgeschichte sichert +Unterhaltung de-DE url-pathpart-nachrichten url-pathpart-stars url-portnumber-80 url-domainname-www.tz-online.de url-domainprefix-www url-domainprefix-www.tz-online url-domaintype-de url-domainsuffix-tz-online.de url-hasarguments- url-firstpartpagename-kurt-kroemer-darum-brauche-keinen-fuehrerschein-1456312 url-pagepartsplitname-kurt url-pagepartsplitname-kroemer url-pagepartsplitname-darum url-pagepartsplitname-brauche url-pagepartsplitname-keinen url-pagepartsplitname-fuehrerschein url-pagepartsplitname-1456312 url-lastpartpagename-html url-argname-cmp url-argvalue-defrss url-argvaluesplit-defrss kurt kroemer der komiker kurt kroemer hat keine lust auto zu fahren kurt kroemer hat keinen fuehrerschein sagte der 36 jaehrige der nachrichtenagentur dpa ich vermisse ihn auch nicht wird er bei freunden und bekannten im auto mitgenommen werde fast immer geflucht immer da wo man gerade auto fahren will ist stau sagte kroemer da laufe ich lieber vor seinen auftritten gehe er in der stadt in der gastiere ausgiebig spazieren da sehe und erlebe ich die tollsten sachen sagte kroemer diese kommen dann auf die buehne die geschichten von der strasse sind meist die 
groessten brueller sagte er vor dem start seiner deutschlandtournee am freitag in magdeburg +Unterhaltung de-DE url-pathpart-nachrichten url-pathpart-stars url-portnumber-80 url-domainname-www.tz-online.de url-domainprefix-www url-domainprefix-www.tz-online url-domaintype-de url-domainsuffix-tz-online.de url-hasarguments- url-firstpartpagename-keine-strafe-skandal-rocker-pete-doherty-1430824 url-pagepartsplitname-keine url-pagepartsplitname-strafe url-pagepartsplitname-skandal url-pagepartsplitname-rocker url-pagepartsplitname-pete url-pagepartsplitname-doherty url-pagepartsplitname-1430824 url-lastpartpagename-html url-argname-cmp url-argvalue-defrss url-argvaluesplit-defrss keine strafe fuer skandal rocker pete doherty nuernberg so hatte sich monika gruber ihren ersten auftritt bei wetten dass sicher nicht vorgestellt die kabarettistin wurde bei der samstagabend show opfer einer technsichen panne das war passiert +Sport de-DE url-pathpart-artikel url-domainname-www.neues-deutschland.de url-domainprefix-www url-domainprefix-www.neues-deutschland url-domaintype-de url-domainsuffix-neues-deutschland.de url-firstpartpagename-204208 url-pagepartsplitname-204208 url-lastpartpagename-html baldriantropfen fuer den calcio nicht der gesamte italienische fussball ist korrupt es handelt sich nur um einzelne schwarze schafe in der kickerherde zu dieser besaenftigenden schlussfolgerung verleitet der urteilsspruch der ersten instanz der disziplinarkommission des italienischen fussballverbandes figc sie bestaetigte am dienstag weitgehend die strafen die der anklaeger stefano palazzi zuvor gefordert hatte zur debatte stand allerdings nur ein kleiner teil der zwischenzeitlich von staatsanwaelten wettanbietern und medien als verdaechtig erachteten spiele danach wurden die wichtigsten drahtzieher der spielabsprachen und wettbetruegereien zu fuenf jahren sperre und einem folgenden lebenslangen betaetigungsverbot in allen der figc angeschlossenen ligen verurteilt dies betrifft u a den 
ex nationalspieler giuseppe signori den die staatsanwaelte aus cremona fuer den kopf der sogenannten bologneser gruppe halten zur gleichen strafe wurden der ex profi antonio bellavista der sportdirektor des drittligisten ravenna calcio giorgio buffone und der wettbuerobetreiber massimo erodiani verurteilt von noch aktiven profis traf es den zweitligaspieler vincenzo sommese ascoli sowie die drittliga akteure carlo gervasoni cremona und marco paoloni cremona und benevento deren machenschaften loesten den anfangsverdacht der staatsanwaltschaft cremona aus in der serie a wurden nur der kapitaen von aufsteiger atalanta bergamo cristiano doni 3 jahre 6 monate und dessen mannschaftsgefaehrte thomas manfredini 3 jahre belangt beide waren von ihrem verein bereits bei einem freundschaftsspiel am wochenende nicht mehr beruecksichtigt worden nicht deshalb sondern wegen einer urspruenglich fehlerhaften addierung wurde atalanta vom gericht mit einem minuspunkt weniger belegt statt mit geforderten minus sieben startet der aufsteiger nun mit minus sechs punkten in die am 27 august beginnende saison figc anklaeger palazzi vergab zwei minuspunkte pro anklagepunkt fuer vereinsangehoerige und einen zusatzpunkt bei besonderer schwere des vergehens andere vereine konnten sich ueber eine noch gewichtigere strafverkuerzung freuen drittligist benevento geht nur mit neun statt 14 minuspunkten in die neue saison cremoneses handicap betraegt sechs statt neun punkte bei zweitligist ascoli blieb es bei den geforderten sechs punkten abzug die drittligisten alessandria und ravenna muessen zwangsweise absteigen die urteile sind allerdings noch nicht endgueltig fuer kommenden dienstag ist die verhandlung der zweiten instanz geplant giuseppe signori kuendigte bereits berufung an ebenso die anwaelte von atalanta bergamo und us cremonese prozessbeobachter gehen von einem aehnlichen szenario wie bei der aufarbeitung des schiedsrichterbestechungsskandals im jahre 2006 aus dort wurden die strafen von 
instanz zu instanz milder einsicht in schuldhaftes verhalten ist im italienischen fussball nicht sonderlich verbreitet wie schon 2006 beklagten auch die aktuellen richter eine omerta ein schweigegeluebde im fussballmilieu das eine aufklaerung erschwere dass der gesamte sumpf nicht trockengelegt wurde zeigt sich indes daran dass die disziplinarkammer nur ueber jene 18 spiele befand die die staatsanwaltschaft cremona in einer ersten phase im visier hatte bei mindestens 30 spielen der italienischen ligen hatten wettanbieter jedoch unregelmaessigkeiten festgestellt das verfahren dient damit eher der beruhigung anstatt der aufklaerung +Sport de-DE url-pathpart-artikel url-domainname-www.neues-deutschland.de url-domainprefix-www url-domainprefix-www.neues-deutschland url-domaintype-de url-domainsuffix-neues-deutschland.de url-firstpartpagename-199706 url-pagepartsplitname-199706 url-lastpartpagename-html fussball pur aber professioneller vor rund zwei wochen hat thomas bastian beim finanziell krisengeschuettelten fussballklub sv babelsberg 03 die nachfolge des zurueckgetretenen praesidenten rainer sperr uebernommen ueber die immer noch angespannte situation und die schwierige aufgabe den verein zu stabilisieren sprach mit dem 51 jaehrigen kino inhaber der schon seit 2003 als vertreter der fans im aufsichtsrat sass peer wilhelms nd +Sport de-DE url-pathpart-sport url-pathpart-wintersport url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-eishockey-del-del-mannheim-und-eisbaeren-mit-muehevollen-siegen_aid_677052 url-pagepartsplitname-eishockey url-pagepartsplitname-del url-pagepartsplitname-del url-pagepartsplitname-mannheim url-pagepartsplitname-und url-pagepartsplitname-eisbaeren url-pagepartsplitname-mit url-pagepartsplitname-muehevollen url-pagepartsplitname-siegen url-pagepartsplitname-aid url-pagepartsplitname-677052 url-lastpartpagename-html eishockey del del spitzenreiter 
adler mannheim hat am 12 spieltag der deutschen eishockey liga die tabellenfuehrung durch ein muehevolles 4 +Sport de-DE url-pathpart-sport url-pathpart-fussball url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-dfl-bierwerbung-rauball-contra-zwanziger_aid_676706 url-pagepartsplitname-dfl url-pagepartsplitname-bierwerbung url-pagepartsplitname-rauball url-pagepartsplitname-contra url-pagepartsplitname-zwanziger url-pagepartsplitname-aid url-pagepartsplitname-676706 url-lastpartpagename-html dfl bierwerbung wenn der dfb beschliesst keine werbevertraege mit brauereien mehr abzuschliessen werden wir die dfl einen anderen weg gehen sagte rauball bei einem sport empfang der spd buergerschaftsfraktion im festsaal des hamburger rathauses zwanziger hatte die verhandlungen ueber eine vertragsverlaengerung mit dem nationalmannschaftssponsor bitburger ueber die euro 2012 hinaus gestoppt da der deutsche fussball bund dfb die neue aktion alkoholfrei sport geniessen des deutschen olympischen sportbundes dosb und der bundeszentrale fuer gesundheitliche aufklaerung bzga unterstuetzt hat zwanziger vorlaeufig sein veto gegen eine vertragsverlaengerung eingelegt wenn der verband zeitgleich mit seinem partner bitburger ueber eine vertragsverlaengerung verhandelt dann muss sehr genau ausgelotet werden ob und wie sich diese aktivitaet noch mit einer bierwerbung vertraegt sagte zwanziger leidtragende eines generellen alkoholwerbeverbots waere vor allem die bundesliga der rund 300 millionen euro verloren gingen wir muessen nicht paepstlicher sein als der papst sagte rauball am donnerstag +Sport de-DE url-pathpart-c url-pathpart-32191 url-pathpart-f url-pathpart-443319 url-pathpart-s url-pathpart-18d3c775 url-pathpart-l url-pathpart-0L0Sfocus0Bde0Csport0Cfussball0Cbundesliga10Cbundesliga0Ehsv0Eboss0Estevens0Everpflichtung0Enoch0Enicht0Efix0Iaid0I6685620Bhtml url-domainname-rss2.focus.de 
url-domainprefix-rss2 url-domainprefix-rss2.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-story01 url-pagepartsplitname-story01 url-lastpartpagename-htm bundesliga hsv boss die chefetage des hsv will anfang kommender woche einen neuen trainer praesentieren hat aber eine bereits feststehende verpflichtung des niederlaenders huub stevens dementiert die chefetage des fussball bundesligisten hamburger sv will anfang kommender woche einen nachfolger fuer den entlassenen trainer michael oenning praesentieren hat aber eine bereits feststehende verpflichtung des niederlaenders huub stevens dementiert es gibt noch nichts zu vermelden es ist noch nichts fix sagte der hamburger vorstandsvorsitzende carl edgar jarchow nach dem ersten saisonsieg am freitagabend beim vfb stuttgart 2 +Sport de-DE url-pathpart-artikel url-domainname-www.neues-deutschland.de url-domainprefix-www url-domainprefix-www.neues-deutschland url-domaintype-de url-domainsuffix-neues-deutschland.de url-firstpartpagename-205901 url-pagepartsplitname-205901 url-lastpartpagename-html breite flotte duenne spitze fuer manche deutsche medienvertreter war die bis sonntag dauernde ruder wm im slowenischen bled schon am donnerstag vorbei da gewann der bedeutungsschwanger benamte deutschland achter sein 30 rennen und den dritten titel in serie das grossboot gern als flaggschiff des deutschen rudersports tituliert verdiente sich diese beim olympia debakel 2008 mit rang acht in peking verloren gegangene bezeichnung erst bei dieser wm wieder seit der olympiapleite hat das vom dortmunder ralf holtmeyer betreute prestigeobjekt kein rennen mehr verloren in der oeffentlichkeit wird der achter als synonym fuer den rudersport betrachtet und uebersehen dass er nur ein mosaikstein einer vielfaeltigen sportart ist in bled stehen 27 wettbewerbe auf dem programm 14 olympisch acht nichtolympisch und fuenf sogenannte adaptive events fuer behinderte von den 14 wiederum interessieren neben dem achter eventuell 
noch der einer marcel hacker und die doppelvierer natuerlich hat der achter jeden respekt verdient art und weise wie er unter regie des in wriezen geborenen steuermanns martin sauer die weltelite dominiert ist faszinierend gleichwohl stehen dem andere kaum nach beispiel +Sport de-DE url-pathpart-c url-pathpart-32191 url-pathpart-f url-pathpart-443319 url-pathpart-s url-pathpart-1835b8d7 url-pathpart-l url-pathpart-0L0Sfocus0Bde0Csport0Cmehrsport0Cmoderner0Efuenfkampf0Ewm0Efuenfkampf0Ewm0Edeutsche0Emaenner0Eim0Efinale0Echancenlos0Iaid0I6642680Bhtml url-domainname-rss2.focus.de url-domainprefix-rss2 url-domainprefix-rss2.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-story01 url-pagepartsplitname-story01 url-lastpartpagename-htm moderner fuenfkampf wm fuenfkampf wm andrei moissejew hat sich bei der heim wm der modernen fuenfkaempfer in moskau die krone aufgesetzt das deutsche herren trio hatte hingegen mit der entscheidung nichts zu tun doppel olympiasieger delf borrmann berlin stefan koellner potsdam und alexander nobis berlin verpassten als 17 19 und 25 die top ten deutlich silber hinter dem ueberlegenen moissejew der 2004 in athen und 2008 in peking olympisches gold geholt hatte gewann sein russischer landsmann alexander lesun vor dem ungarn adam marosi weltmeister von 2009 +Sport de-DE url-pathpart-c url-pathpart-32191 url-pathpart-f url-pathpart-443319 url-pathpart-s url-pathpart-1782d53b url-pathpart-l url-pathpart-0L0Sfocus0Bde0Csport0Cfussball0Cbundesliga10Cbundesliga0Eschalke0Eentscheidung0Eim0Efall0Eral0Ebis0Ezum0Esonntag0Iaid0I6569510Bhtml url-domainname-rss2.focus.de url-domainprefix-rss2 url-domainprefix-rss2.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-story01 url-pagepartsplitname-story01 url-lastpartpagename-htm bundesliga schalke ein klaerendes gespraech mit manager horst heldt werde noch vor dem spiel gegen mainz stattfinden sagte trainer ralf rangnick am freitag nach dem 0 +Sport de-DE 
url-pathpart-2011 url-pathpart-40 url-domainname-www.zeit.de url-domainprefix-www url-domainprefix-www.zeit url-domaintype-de url-domainsuffix-zeit.de url-firstpartpagename-Deutschlandkarte-Fussball url-pagepartsplitname-Deutschlandkarte url-pagepartsplitname-Fussball deutschlandkarte wer will kann belegen dass das interesse am frauenfussball seit der weltmeisterschaft im sommer enorm ja rasant gestiegen ist in der neuen bundesliga saison kamen zu den spielen des weiblichen fc bayern muenchen im schnitt 615 zuschauer statt 473 im vorigen jahr was einer steigerung von mehr als 30 prozent entspricht um die karte zu vergroessern klicken sie bitte hier andererseits verzeihung 615 zuschauer das schaffen die bayern maenner beim training der vergleich ist aber deswegen unfair weil die bayern frauen in der bundesliga nur mittelmass sind siebte von zwoelfen sind sie gerade die traditionell erfolgreichen frankfurterinnen duisburgerinnen und potsdamerinnen ziehen schon mehr zuschauer an es scheint zu helfen wenn es keine konkurrierenden erstliga maennermannschaft in der stadt gibt von einer sogwirkung durch die maenner ist also eher nicht auszugehen eine ausnahme ist der vfl wolfsburg vielleicht liegt es an dem was man dort am wochenende sonst so tun kann vielleicht ist es aber auch die emanzipierteste stadt des landes +Sport de-DE url-pathpart-sport url-pathpart-formel1 url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-formel-1-hamilton-schnappt-vettel-pole-position-weg_aid_674833 url-pagepartsplitname-formel url-pagepartsplitname-1 url-pagepartsplitname-hamilton url-pagepartsplitname-schnappt url-pagepartsplitname-vettel url-pagepartsplitname-pole url-pagepartsplitname-position url-pagepartsplitname-weg url-pagepartsplitname-aid url-pagepartsplitname-674833 url-lastpartpagename-html formel 1 hamilton schnappt vettel pole position weg mclaren pilot lewis hamilton hat sebastian vettel 
die pole position fuer den gp von suedkorea weggeschnappt vettel fehlten auf rang zwei etwas mehr als zwei zehntelsekunden jede serie endet einmal auch fuer sebastian vettel und sein red bull team das in suedkorea beim 16 rennen des jahres erstmals nicht die pole position holte in einem wieder einmal erneut spannenden qualifying stoppte der brite lewis hamilton im mclaren mercedes den bullen express und verwies den alten und neuen weltmeister vettel um etwas mehr als zwei zehntelsekunden auf platz zwei worueber sich der champion dennoch freute ich bin sehr gluecklich gerade weil es anfangs hier nicht so gut aussah mclaren war sehr stark aber auch wir waren im qualifying auf den punkt da sagte vettel und fuegte hinzu +Technik & Wissen de-DE url-pathpart-artikel url-domainname-www.neues-deutschland.de url-domainprefix-www url-domainprefix-www.neues-deutschland url-domaintype-de url-domainsuffix-neues-deutschland.de url-firstpartpagename-171875 url-pagepartsplitname-171875 url-lastpartpagename-html held mit schattenseiten kein zweifel er war sehr gefragt und so schrieb er +Technik & Wissen de-DE url-pathpart-wirtschaft url-pathpart-soziales url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,768732,00 url-pagepartsplitname-0,1518,768732,00 url-lastpartpagename-html#ref=rss solarfoerderung die solarbranche kann vorerst aufatmen +Technik & Wissen de-DE url-pathpart-artikel url-domainname-www.neues-deutschland.de url-domainprefix-www url-domainprefix-www.neues-deutschland url-domaintype-de url-domainsuffix-neues-deutschland.de url-firstpartpagename-172364 url-pagepartsplitname-172364 url-lastpartpagename-html traumatische erlebnisse brennen sich ein nd +Technik & Wissen de-DE url-pathpart-artikel url-domainname-www.neues-deutschland.de url-domainprefix-www url-domainprefix-www.neues-deutschland url-domaintype-de url-domainsuffix-neues-deutschland.de 
url-firstpartpagename-169252 url-pagepartsplitname-169252 url-lastpartpagename-html ist da draussen jemand eine frage bewegt die menschheit seit langem +Technik & Wissen de-DE url-pathpart-wissenschaft url-pathpart-weltall url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,747514,00 url-pagepartsplitname-0,1518,747514,00 url-lastpartpagename-html#ref=rss space shuttle 40 000 menschen schauten vor ort zu 1100 journalisten waren versammelt +Technik & Wissen de-DE url-pathpart-wissenschaft url-pathpart-technik url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,740808,00 url-pagepartsplitname-0,1518,740808,00 url-lastpartpagename-html#ref=rss neckarwestheim i das atomkraftwerk neckarwestheim i ist der erste meiler der seinen weiterbetrieb der von schwarz gelb beschlossenen laufzeitverlaengerung verdankt doch ein neues gutachten moniert dass wichtige sicherheitsnachruestungen jahrelang verschleppt wurden lange sah es so aus als wuerde der erste block des kernkraftwerks neckarwestheim in diesen tagen zum technikmuseum werden nach dem von rot gruen beschlossenen atomausstieg waere der uralt meiler offizielle abkuerzung +Technik & Wissen de-DE url-pathpart-wissenschaft url-pathpart-technik url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,751970,00 url-pagepartsplitname-0,1518,751970,00 url-lastpartpagename-html#ref=rss akw katastrophe in japan zum ersten mal gibt es bilder die den zustand des akw fukushima aus naechster naehe zeigen +Technik & Wissen de-DE url-pathpart-~r url-pathpart-rp url-pathpart-online url-pathpart-rss url-pathpart-wissen url-pathpart-~3 url-pathpart-kAfWJpo4WuM url-domainname-feeds.rp-online.de url-domainprefix-feeds 
url-domainprefix-feeds.rp-online url-domaintype-de url-domainsuffix-rp-online.de url-firstpartpagename-1019185 url-pagepartsplitname-1019185 studie arten fliehen schneller vor klimawandel washington rpo viele tiere und pflanzen fliehen sehr viel schneller als gedacht vor dem klimawandel +Technik & Wissen de-DE url-pathpart-~r url-pathpart-rp url-pathpart-online url-pathpart-rss url-pathpart-wissen url-pathpart-~3 url-pathpart-PTSlL1qolho url-domainname-feeds.rp-online.de url-domainprefix-feeds url-domainprefix-feeds.rp-online url-domaintype-de url-domainsuffix-rp-online.de url-firstpartpagename-story01 url-pagepartsplitname-story01 url-lastpartpagename-htm was man wissen kann aber nicht muss warum man sich nicht selbst kitzeln kann duesseldorf rpo es gibt sachen die kann man wissen muss man aber nicht man koennte auch ohne sie leben interessant sind sie aber dennoch und als gespraechsstoff fuer die naechste party halten sie allemal her wir haben eine menge davon fuer sie gesammelt zum beispiel diese +Technik & Wissen de-DE url-pathpart-wissen url-domainname-www.sueddeutsche.de url-domainprefix-www url-domainprefix-www.sueddeutsche url-domaintype-de url-domainsuffix-sueddeutsche.de url-firstpartpagename-packeeis-schrumpft-auf-neues-minimum-gefaehrliches-tauwetter-im-ewigen-eis-1 url-pagepartsplitname-packeeis url-pagepartsplitname-schrumpft url-pagepartsplitname-auf url-pagepartsplitname-neues url-pagepartsplitname-minimum url-pagepartsplitname-gefaehrliches url-pagepartsplitname-tauwetter url-pagepartsplitname-im url-pagepartsplitname-ewigen url-pagepartsplitname-eis url-pagepartsplitname-1 url-lastpartpagename-1141322 packeis schrumpft auf neues minimum das meereis in der arktis wird in diesem sommer auf ein neues minimum zusammenschmelzen das aktuelle tempo des eisrueckgangs laesst darauf schliessen dass der bisherige negativrekord aus dem jahr 2007 unterboten wird teilte die universitaet bremen am freitag mit demnach ist die meereisflaeche am donnerstag auf 4 24 
millionen quadratkilometer geschrumpft und lag damit unter dem entsprechenden tageswert des jahres 2007 damals waren 4 27 millionen quadratkilometer gemessen worden dass in diesem jahr ein negativrekord zu erwarten ist hatte sich bereits am anfang der woche abgezeichnet die eisdecke ist derzeit an den raendern so stark aufgebrochen dass die sonneneinstrahlung die oberste wasserschicht erwaermen kann und noch viele schollen schmelzen werden sagte ruediger gerdes meereisphysiker am bremerhavener alfred wegener institut am montag der rueckgang des sommerlichen eises betraegt seit 1972 bereits 50 prozent warnt der bremer umweltphysiker georg heygster dabei hat die packeisflaeche in diesem jahr offenbar eine andere form als in den vergangenen eisarmen perioden in diesem sommer gab es auffallend grosse eisfreie flaechen innerhalb der packeis zone zum beispiel in der laptev see im norden russlands dieses loch erweckt den eindruck als sei das eis hier von unten geschmolzen es tat sich anfang august auf und hat inzwischen die groesse hollands erreicht erklaerte der meereisforscher lars kaleschke vom klimacampus der universitaet hamburg vor wenigen tagen auch wissenschaftler des alfred wegener instituts die vor kurzem an bord des forschungseisbrechers polarstern bis zum nordpol vorgedrungen waren bestaetigten die geringe dicke des meereises messungen hatten einen durchschnittswert von 90 zentimetern ergeben im jahr 2001 war das meereis noch durchschnittlich zwei meter dick die arktis gilt unter klimaforschern als aeusserst sensible region die dortigen eisflaechen reagieren nicht nur empfindlich auf die erderwaermung sie bestimmen auch massgeblich deren voranschreiten so wie sich ein weisses auto unter sonnenstrahlung weniger aufheizt als ein dunkles fahrzeug reflektiert die eisflaeche der arktis mehr sonnenstrahlung zurueck in den weltraum als das vergleichsweise dunkle meerwasser schmilzt das packeis der arktis kurbelt das den klimawandel zusaetzlich an +Technik & Wissen 
de-DE url-pathpart-wissenschaft url-pathpart-weltall url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,792822,00 url-pagepartsplitname-0,1518,792822,00 url-lastpartpagename-html#ref=rss start des galileo satelliten es soll eine doppelpremiere werden erstmals hebt eine russische sojus rakete von europas weltraumbahnhof kourou ab und +Technik & Wissen de-DE url-pathpart-spiegel url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,791040,00 url-pagepartsplitname-0,1518,791040,00 url-lastpartpagename-html#ref=rss fliegen wie ein vogel kann der mensch bald fliegen wie ein vogel tueftler jagen dem menschheitstraum des schwingenflugs mit beweglichen fluegeln nach als die sonne blutrot ueber dem morgennebel in die hoehe steigt fliegt todd reichert davon emporgetragen wird er vom auf und ab maechtiger fluegel kein motor treibt sein vehikel an mit blosser kraft der beine die er in die pedale stemmt hebt er ab sein strampeln wird durch ein seilsystem auf die fluegel uebertragen monatelang hat der kanadische ingenieurstudent fuer diesen moment trainiert er nahm acht kilo ab und erhielt unterricht von einem kickboxer zusammen mit kommilitonen der university of toronto hatte er vier jahre lang das einzigartige superleichtflugzeug zusammengeschraubt der snowbird besteht grossenteils aus carbonstangen balsaholz und plastikfolie und wiegt nur 44 kilogramm hat aber eine spannweite von 32 metern fast so viel wie eine boeing 737 reicherts flug dauerte nur 19 3 sekunden nach 145 metern verliessen ihn die kraefte und er musste landen mit hilfe des kinderwagenrads das er unter den rumpf geschraubt hatte fuer den piloten war der huepfer ein triumph ein uralter traum vom fliegen ist wahr geworden sagt reichert die vision sich aus eigener kraft in die luefte zu erheben beschwor bereits 
der antike mythos von ikarus mit schwingen aus federn und wachs erhob dieser sich in die luft bis er der sonne zu nahe kam +Technik & Wissen de-DE url-pathpart-~r url-pathpart-rp url-pathpart-online url-pathpart-rss url-pathpart-wissen url-pathpart-~3 url-pathpart-ukOgqmbUdqU url-domainname-feeds.rp-online.de url-domainprefix-feeds url-domainprefix-feeds.rp-online url-domaintype-de url-domainsuffix-rp-online.de url-firstpartpagename-story01 url-pagepartsplitname-story01 url-lastpartpagename-htm bau gefaehrdet amazonas ureinwohner brasilianisches gericht stoppt riesenstaudamm sao paulo rpo ein brasilianisches gericht hat einen baustopp fuer den drittgroessten staudamm der welt im amazonas gebiet angeordnet der bau des umstrittenen belo monte staudamms gefaehrde den fischfang der ureinwohner hiess es am mittwoch zur begruendung das baukonsortium duerfe keine infrastrukturmassnahmen vornehmen welche den natuerlichen strom des xingu flusses und damit den fischbestand beeintraechtigten das bundesgericht des nordbrasilianischen staates para untersagte dem baukonsortium norte energia das flussbett des xingu eines zuflusses des amazonas durch den bau eines hafens oder von deichen durch sprengungen oder das graben von kanaelen zu veraendern bautaetigkeiten die keine auswirkungen auf die oertliche fischerei haben duerfen dem gerichtsurteil zufolge fortgesetzt werden wenn das baukonsortium gegen das urteil verstoesst muss es pro tag 200 000 real etwa 80 000 euro strafe zahlen gegen das elf milliarden dollar 8 1 milliarden euro teure projekt laufen indianische ureinwohner der region und umweltschuetzer seit langem sturm sie verweisen darauf dass den berechnungen zufolge eine flaeche von 500 quadratkilometern geflutet werden wird und 16 000 menschen umgesiedelt werden muessen in ihrem protest werden die ureinwohner von der katholischen kirche aber auch von prominenten wie dem kanadischen starregisseur james cameron avatar oder dem britischen rockmusiker sting unterstuetzt 
+Technik & Wissen de-DE url-pathpart-wissenschaft url-pathpart-technik url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,785831,00 url-pagepartsplitname-0,1518,785831,00 url-lastpartpagename-html#ref=rss atomunfall in suedfrankreich ein ofen fuer radioaktive abfaelle ist explodiert ein arbeiter gestorben doch die ursache fuer das unglueck in der atomanlage marcoule bleibt unklar trotzdem gaben die behoerden schnell entwarnung die franzoesischen medien berichten sehr gelassen nur die anwohner reagieren empoert paris die meldung schockierte die boersen und weckte erinnerungen an das unglueck in japan +Technik & Wissen de-DE url-pathpart-wissenschaft url-pathpart-weltall url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,783022,00 url-pagepartsplitname-0,1518,783022,00 url-lastpartpagename-html#ref=rss absturz eines raumtransporters der absturz eines russischen raumschiffes hat folgen fuer die internationale raumstation +Technik & Wissen de-DE url-pathpart-wissenschaft url-pathpart-technik url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,785765,00 url-pagepartsplitname-0,1518,785765,00 url-lastpartpagename-html#ref=rss kernenergie auf dem gelaende der atomanlage marcoule in suedfrankreich hat sich eine explosion ereignet ein mensch ist nach behoerdenangaben ums leben gekommen vier weitere wurden verletzt radioaktivitaet soll nicht in die umwelt gelangt sein paris wien in der suedfranzoesischen atomanlage marcoule gab es am montag eine explosion es habe einen toten gegeben teilte die franzoesische atomaufsicht asn mit vier weitere personen seien verletzt worden eine davon schwer strahlung ist laut asn und der oertlichen polizei nicht ausgetreten die behoerden 
richteten dennoch eine sicherheitszone ein falls noch radioaktivitaet entweichen sollte nach angaben der asn ist bei dem unfall ein verbrennungsofen fuer schwach radioaktive abfaelle explodiert dazu zaehlen unter anderem kleidung von arbeitern sowie metalle und beton die ursache des ungluecks war zunaechst unklar es handelt sich um einen industrieunfall nicht um einen atomunfall sagte ein sprecher des staatlichen stromkonzerns edf dessen tochterunternehmen socodei die anlage betreibt der bei dem unfall gestorbene arbeiter sei in sekundenschnelle verbrannt sagte ein edf sprecher zu spiegel online der mann habe in einem raum neben dem verbrennungsofen gearbeitet als dieser explodiert sei die asn hat den unfall am montagnachmittag offiziell fuer beendet erklaert dieser unfall bedeutet keine radioaktivitaet und keine massnahmen zum schutz der bevoelkerung teilte die asn mit die ihren krisenstab wieder aufloeste die explosion in einem verbrennungsofen habe ein feuer entfacht das gegen 13 uhr unter kontrolle gewesen sei das gebaeude in dem der ofen stand sei nicht beschaedigt worden die vier verletzten von denen einer schwere verbrennungen erlitt seien nicht verstrahlt worden auch ausserhalb des gebaeudes sei keine radioaktivitaet gemessen worden nun solle untersucht werden wie es zu dem unfall kam auch das unabhaengige franzoesische atomforschungsinstitut criirad hat entwarnung gegeben +Technik & Wissen de-DE url-pathpart-wissenschaft url-pathpart-article13595532 url-domainname-www.welt.de url-domainprefix-www url-domainprefix-www.welt url-domaintype-de url-domainsuffix-welt.de url-firstpartpagename-Arktis-Eis-schmilzt-auf-Rekordminimum-zusammen url-pagepartsplitname-Arktis url-pagepartsplitname-Eis url-pagepartsplitname-schmilzt url-pagepartsplitname-auf url-pagepartsplitname-Rekordminimum url-pagepartsplitname-zusammen url-lastpartpagename-html klimawandel das eis im nordpolarmeer erreicht ein rekordminimum der tiefstand ist laut experten nicht mehr durch die 
natuerliche variabilitaet zu erklaeren das meereis ist in diesem sommer auf ein neues rekordminimum geschmolzen das selbst die bisher geringste eisausdehnung im nordpolarmeer aus dem jahr 2007 unterschreitet mit nur noch 4 24 millionen quadratkilometer ist das sommerliche eis seit 1971 um 50 prozent zurueckgegangen georg heygster vom institut fuer umweltphysik an der universitaet bremen zufolge ist diese negativmarke nicht mehr durch die natuerliche variabilitaet von jahr zu jahr zu erklaeren sondern zeigt nur umso deutlicher was fuer gravierende auswirkungen der klimawandel auf die arktis hat schon vor wochen hatten wissenschaftler diesen september mit dem tiefstand der eisbedeckung gerechnet jetzt wurden die zahlen von der universitaet in bremen vorgelegt die eisausdehnung schwankt nach heygsters angaben im jahresverlauf zwischen etwa 15 millionen quadratkilometern im maerz und 5 quadratkilometern im september der aktuelle wert laege allerdings diesen september um 0 6 prozent unter dem minimum von 2007 fuer kleinlebewesen die an der unterseite des eises leben und gleichzeitig ausgangspunkt der nahrungskette auch fuer uns menschen sind bleibt immer weniger lebensraum erlaeuterte der umweltphysiker der universitaet bremen weiter hinzu kommt dass auch in diesem jahr die nordost und nordwestpassagen wieder gleichzeitig eisfrei seien was erstmalig 2008 beobachtet wurde heygst aussagen zufolge zeigen mehrjaehrige beobachtungen dass die mittlere eisdecke abnimmt es bleibt abzuwarten wie sich die situation weiter entwickelt da der wert der eisausdehnung in den naechsten wochen weiter abnehmen kann +Technik & Wissen de-DE url-pathpart-wissenschaft url-pathpart-weltall url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,786411,00 url-pagepartsplitname-0,1518,786411,00 url-lastpartpagename-html#ref=rss space launch system die us raumfahrtbehoerde sucht den shuttle 
nachfolger nun ist die nasa fuendig geworden die neue riesenrakete space launch system soll astronauten kuenftig sogar bis zum mars bringen doch das kraftpaket kostet viel geld deswegen setzen die ingenieure auch auf technik von gestern berlin es soll ein kraftvoller befreiungsschlag werden +Technik & Wissen de-DE url-pathpart-wissenschaft url-pathpart-article13575103 url-domainname-www.welt.de url-domainprefix-www url-domainprefix-www.welt url-domaintype-de url-domainsuffix-welt.de url-firstpartpagename-9000-Saeuglinge-sterben-taeglich-Versorgung-fehlt url-pagepartsplitname-9000 url-pagepartsplitname-Saeuglinge url-pagepartsplitname-sterben url-pagepartsplitname-taeglich url-pagepartsplitname-Versorgung url-pagepartsplitname-fehlt url-lastpartpagename-html un schaetzungen rund 40 prozent aller todesfaelle von kindern unter fuenf jahren seien in den ersten 28 tagen nach der geburt zu verzeichnen erlaeuterte die who dabei ist die kritischste zeit die erste woche hauptursache fuer die saeuglingssterblichkeit sind unterernaehrung infektionen und sauerstoffmangel auch zu frueh geborene kinder haben in entwicklungslaendern geringere ueberlebenschancen weil es an medizinischer betreuung und geeigneten geraeten wie brutkaesten mangelt allerdings sank demnach die zahl der neugeborenen die in ihren ersten vier lebenswochen sterben zwischen 1990 und 2009 von schaetzungsweise 4 6 millionen auf 3 3 millionen als grund sieht die who die gestiegenen investitionen im gesundheitswesen grosse fortschritte gebe es in china in afrika sehe es dagegen schlecht aus in indien treten nach who angaben jaehrlich rund 900 000 todesfaelle bei neugeborenen auf das seien 28 prozent der gesamttodesfaelle in diesem alter weltweit es folgen nigeria und pakistan china liegt auf platz vier mit blick auf die gesamtzahl der todesfaelle bei neugeborenen dort ist der anteil der todesfaelle von 23 pro 1000 lebendgeborene im jahr 1990 auf 11 pro 1000 lebendgeborene im jahr 2009 zurueckgegangen tschechien 
und andere eu staaten haben sich deutlich verbessert da in afrika nur ein rueckgang von jaehrlich einem prozent zu verzeichnen sei duerfte es rein statistisch gesehen mehr als 150 jahre dauern bis es den level der usa oder grossbritanniens bei der neugeborenensterblichkeit erreicht habe heisst es in der mitteilung weiter von den zehn laendern die die sterberate bei neugeborenen um jeweils mindestens zwei drittel in zwei dekaden gesenkt haben gehoeren unter anderem zypern tschechien estland griechenland und luxemburg der kampf gegen die kindersterblichkeit gehoert zu den acht millenniumsentwicklungszielen der un verglichen mit 1990 soll die kindersterblichkeit bis 2015 um zwei drittel sinken die un millenniumsziele zu denen auch die halbierung der armut zaehlt wurden im jahr 2000 von den staats und regierungschefs beschlossen +Technik & Wissen de-DE url-pathpart-c url-pathpart-795 url-pathpart-f url-pathpart-448250 url-pathpart-s url-pathpart-143ff78b url-pathpart-l url-pathpart-0L0Ssueddeutsche0Bde0Cwissen0Carktis0Etauender0Epermafrostboden0Ewie0Eein0Eschweizer0Ekaese0E10B10A85930A url-domainname-rssfeed.sueddeutsche.de url-domainprefix-rssfeed url-domainprefix-rssfeed.sueddeutsche url-domaintype-de url-domainsuffix-sueddeutsche.de url-firstpartpagename-story01 url-pagepartsplitname-story01 url-lastpartpagename-htm arktis hans wolfgang hubberten ist leiter der aussenstelle potsdam des alfred wegener instituts fuer polar und meeresforschung awi seit fast 20 jahren erforscht er die dauerfrostboeden sibiriens hubberten +Technik & Wissen de-DE url-pathpart-wissenschaft url-pathpart-weltall url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,748765,00 url-pagepartsplitname-0,1518,748765,00 url-lastpartpagename-html#ref=rss letzter ausseneinsatz an der iss zwei astronauten der us raumfaehre discovery haben ihren zweiten ausseneinsatz an der iss erfolgreich abgeschlossen 
damit ging eine aera zu ende +Technik & Wissen de-DE url-pathpart-wissenschaft url-pathpart-weltall url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,764936,00 url-pagepartsplitname-0,1518,764936,00 url-lastpartpagename-html#ref=rss europas neuer astronautenchef strahlemann auf schwieriger mission +Technik & Wissen de-DE url-pathpart-wissenschaft url-pathpart-technik url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,656774,00 url-pagepartsplitname-0,1518,656774,00 url-lastpartpagename-html#ref=rss raketen fliegen mit eis und aluminium ein neuartiger raketen treibstoff koennte die raumfahrt revolutionieren das ueberraschend simple gemisch aus wasser und kleinsten aluminium partikeln soll umweltfreundlicher sicherer und ebenso leistungsstark wie herkoemmlicher sprit sein zudem koennte es den weg zum mars ebnen es ist jahrzehnte her seit die ersten raketen in den himmel gestiegen sind doch waehrend sich die flugkoerper selbst rasant weiterentwickelt haben sind die treibstoffe heute weitgehend dieselben wie noch vor mehr als 50 jahren jetzt aber koennte eine mixtur aus metall und eis einen entscheidenden fortschritt bringen er soll nach vorstellungen seiner entwickler raketenstarts nicht nur sauberer machen sondern koennte auch das nachtanken an weit entfernten zielen wie etwa dem mars ermoeglichen der treibstoff namens alice kurz fuer aluminium ice soll seine energie aus der chemischen reaktion zwischen wasser und aluminium erhalten zudem koennte der dabei entstehende wasserstoff auch noch fuer andere dinge gut sein etwa fuer den betrieb von brennstoffzellen auf langzeit raumfluegen forscher der purdue university in west lafayette us bundesstaat indiana haben im august bereits eine rund drei meter lange rakete mit hilfe des neuartigen treibstoffs in den himmel 
geschossen der flugkoerper erreichte nach angaben der universitaet eine hoehe von immerhin rund 400 metern damit haben wir bewiesen dass das konzept prinzipiell funktioniert sagte purdue professor steven son ihre ergebnisse haben die wissenschaftler unter anderem im fachblatt proceedings of the combustion institute veroeffentlicht der schluessel zur kraft von alice ist die geringe groesse der aluminium partikel solche teilchen werden bereits in treibstoffen eingesetzt etwa in den feststoff booster raketen des space shuttles oder in den neuen ares raketen der nasa doch die partikel sind meist wesentlich groesser als jene die in alice zum einsatz kommen die sind im durchmesser lediglich rund 80 nanometer klein verbrennen deshalb schneller und ermoeglichen eine bessere kontrolle ueber den rueckstoss so die purdue forscher die alu partikel werden dafuer mit wasser zu einer zaehen paste vermischt sie wird in einen zylinder mit einem stab in der mitte gefuellt und eingefroren ist die masse hart wird der stab entfernt so dass eine runde aushoehlung zurueckbleibt der feststoff wird dann mit hilfe eines kleinen raketentriebwerks an der spitze des zylinders gezuendet +Gesundheit de-DE url-pathpart-~r url-pathpart-rp url-pathpart-online url-pathpart-rss url-pathpart-gesundheit url-pathpart-~3 url-pathpart-SkdtvYgtg5c url-domainname-feeds.rp-online.de url-domainprefix-feeds url-domainprefix-feeds.rp-online url-domaintype-de url-domainsuffix-rp-online.de url-firstpartpagename-story01 url-pagepartsplitname-story01 url-lastpartpagename-htm brustschmerzen durch verspannten ruecken was haltungsschaeden ausloesen koennen frankfurt main rpo die sorge ist gross wenn es ploetzlich im brustkorb sticht viele betroffene denken sofort an einen herzinfarkt oder auch an brustkrebs mit haltungsschaeden und muskelverspannungen wird das stechen nicht in verbindung gebracht nicht selten aber ist das bruststechen die folge von haltungsschaeden durch zu vieles sitzen und zu wenig bewegung bei 
brustschmerzen muss natuerlich als erstes ein arzt andere ursachen wie herzinfarkt rippenbrueche oder erkrankungen der lunge ausschliessen sagt peter ivanits facharzt fuer orthopaedie aus frankfurt am main aber bei vielen patienten ruehre das stechen in der brust das meist im sitzen oder anderen ruhephasen auftrete von einer verspannten rueckenmuskulatur mit massagen und krankengymnastik bekommen wir die akuten beschwerden schnell in der griff sagt ivanits aber ohne vorbeugung koennten die brustschmerzen regelmaessig wiederkehren die beste praevention ist regelmaessiger sport sagt ivanits im alltag koennten aber auch ein paar einfache regeln schon helfen +Gesundheit de-DE url-pathpart-~r url-pathpart-rp url-pathpart-online url-pathpart-rss url-pathpart-gesundheit url-pathpart-~3 url-pathpart-nsxWjaU67aA url-domainname-feeds.rp-online.de url-domainprefix-feeds url-domainprefix-feeds.rp-online url-domaintype-de url-domainsuffix-rp-online.de url-firstpartpagename-story01 url-pagepartsplitname-story01 url-lastpartpagename-htm steve jobs seltene krebskrankheit sieben jahre ueberleben bedeuten einen sieg rp unter medizinern wurde der fall steve jobs seit langem kontrovers diskutiert auf der einen seite +Ausland de-DE url-pathpart-politik url-pathpart-article2055267 url-domainname-www.abendblatt.de url-domainprefix-www url-domainprefix-www.abendblatt url-domaintype-de url-domainsuffix-abendblatt.de url-firstpartpagename-Menschenjagd-auf-Kairos-Strassen-nach-Christen-Protesten url-pagepartsplitname-Menschenjagd url-pagepartsplitname-auf url-pagepartsplitname-Kairos url-pagepartsplitname-Strassen url-pagepartsplitname-nach url-pagepartsplitname-Christen url-pagepartsplitname-Protesten url-lastpartpagename-html menschenjagd auf kairos strassen nach christen protesten hamburg kairo als tausende christliche kopten gemeinsam mit vielen muslimen in kairo auf die strasse gingen um gegen die wachsende diskriminierung der kopten zu demonstrieren sollte dies ein zeichen friedlicher 
solidaritaet setzen doch das chaos das dann daraus erwuchs gefaehrdet die gesamte gesellschaftliche und politische entwicklung im bevoelkerungsreichsten arabischen staat ploetzlich flogen steine dann fielen schuesse schliesslich marschierte die armee auf ein militaerfahrzeug raste vor und zurueck durch die aufgebrachte menge und ueberrollte mehrere menschen am ende waren nach offiziellen angaben mindestens 26 menschen tot und weit mehr als 200 verletzt inoffizielle beobachter sprachen gar von 36 toten und mehr als 300 verletzten es waren die schlimmsten unruhen seit den wirren im zusammenhang mit dem sturz des langjaehrigen despoten husni mubarak im februar die bundesregierung in berlin forderte die fuehrung in kairo auf fuer ein klima religioeser toleranz zu sorgen und die vorgaenge so schnell wie moeglich aufzuklaeren wie regierungssprecher steffen seibert sagte bundesaussenminister guido westerwelle fdp verurteilte die gewalt am rande des eu ministerrats in luxemburg scharf wer als christ seinen glauben praktizieren moechte muss das frei tun koennen ohne dass er koerperlich bedroht wird oder um sein leben fuerchten muss sagte westerwelle die eu aussenbeauftragte catherine ashton sagte die europaeische union erwarte dass aegypten menschen jeder glaubensrichtung schuetze 26 tote sagte ashton entsetzt niemand auf der welt gibt menschen das recht einen religionskampf zu fuehren der in kairo herrschende militaerrat verstaerkte die sicherheitsmassnahmen verhaengte ueber teile kairos eine naechtliche ausgangssperre und zog zusaetzliche truppen vor dem parlamentsgebaeude und anderen zentralen einrichtungen zusammen die unruhen hatten am gebaeude des staatlichen fernsehens begonnen und sich dann rasch auf andere teile der hauptstadt inklusive des tahrir platzes ausgeweitet dort hatte die revolte gegen mubarak ihren ausgang genommen ausloeser der juengsten koptischen proteste waren ein angriff radikaler muslime auf die kirche des dorfes mari nab bei edfu die teilweise 
zerstoert wurde und der konflikt um eine schule in der provinz minia gewesen in der christliche maedchen gezwungen worden waren mit kopftuechern zu erscheinen die kopten werfen der neuen regierung in kairo vor sie nicht vor uebergriffen zu schuetzen und fordern den ruecktritt des gouverneurs der provinz assuan der die angriffe auf die kirche gerechtfertigt habe gouverneur mustafa al sajjid hatte behauptet das gotteshaus sei ohne genehmigung errichtet worden es war nicht die erste koptische kirche die in flammen aufging die radikalislamische salafistenbewegung wies jede verantwortung fuer die unruhen zurueck nach dem sturz mubaraks haben radikale islamisten in aegypten starken auftrieb erhalten rund 1000 soldaten und polizisten griffen in die sich rasch entwickelnden handfesten auseinandersetzungen zwischen kopten und militanten muslimen ein der staatliche fernsehsender nil tv wurde unter anderem von der internationalen gesellschaft fuer menschenrechte igfm beschuldigt sehr einseitig zulasten der kopten berichtet zu haben beobachter sprachen von hetze so meldete das staatliche programm channel 1 die kopten haetten zwei soldaten erschossen obwohl deren todesumstaende voellig ungeklaert waren und rief zur unterstuetzung der muslime auf diese stroemten dann herbei und attackierten die kopten ein rasender mob machte regelrecht jagd auf autos in denen koptische christen vermutet wurden die fahrzeuge und ihre insassen wurden angegriffen wie augenzeugen berichteten griffen armee und polizei nicht dagegen ein die groesste aegyptische zeitung al ahram berichtete anders als das fernsehen und schrieb der friedliche prokoptische protestzug sei von provokateuren radikalen muslimen und soldaten mit steinen und schuessen angegriffen worden ministerpraesident essam scharaf rief die aegypter zur ruhe auf es handle sich keineswegs um religionsunruhen sondern um eine verschwoerung der premier fuegte hinzu +Ausland de-DE url-pathpart-politik url-pathpart-article2006955 
url-domainname-www.abendblatt.de url-domainprefix-www url-domainprefix-www.abendblatt url-domaintype-de url-domainsuffix-abendblatt.de url-firstpartpagename-28-Menschen-sterben-bei-Attentat-auf-Moschee url-pagepartsplitname-28 url-pagepartsplitname-Menschen url-pagepartsplitname-sterben url-pagepartsplitname-bei url-pagepartsplitname-Attentat url-pagepartsplitname-auf url-pagepartsplitname-Moschee url-lastpartpagename-html 28 menschen sterben bei attentat auf moschee bagdad dubai bei einem selbstmordanschlag sind in der irakischen hauptstadt bagdad mindestens 28 menschen getoetet worden 37 weitere personen wurden verletzt wie der arabische fernsehsender al arabija am sonntag unter berufung auf das irakische innenministerium weiter mitteilte der attentaeter habe sich nach dem abendgebet in einer sunnitischen moschee im westen bagdads in die luft gesprengt unter den toten war den angaben zufolge auch ein parlamentsabgeordneter die um al kura moschee im stadtviertel al dschamiaah ist das groesste sunnitische gotteshaus in der irakischen hauptstadt die bluttat weckte erinnerungen an einen anschlag auf einen schiitischen schrein in der sunnitischen stadt samarra im jahr 2006 der eine welle sektiererisch motivierte gewalt im irak ausloeste dpa dapd +Ausland de-DE url-pathpart-politik url-pathpart-ausland url-pathpart-krise url-pathpart-in url-pathpart-der url-pathpart-arabischen url-pathpart-welt url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-libyen-rebellen-sind-gaddafi-auf-der-spur_aid_659357 url-pagepartsplitname-libyen url-pagepartsplitname-rebellen url-pagepartsplitname-sind url-pagepartsplitname-gaddafi url-pagepartsplitname-auf url-pagepartsplitname-der url-pagepartsplitname-spur url-pagepartsplitname-aid url-pagepartsplitname-659357 url-lastpartpagename-html libyen rebellen sind gaddafi auf der spur nach der eroberung von tripolis durch libysche rebellen stehen jetzt 
auch muammar el gaddafis letzte machtbastionen im land vor dem fall weiter unklar ist wo der diktator untergetaucht ist angeblich verfolgen die rebellen einen verdaechtigen lkw konvoi die kaempfe zwischen aufstaendischen und gaddafi treuen truppen konzentrierten sich am freitag auf die beiden verbliebenen hochburgen des alten regimes gaddafis heimatstadt sirte sowie die wuestenstadt sebha im zentrum des landes dagegen flauten die kaempfe in tripolis nach fernsehberichten deutlich ab weiter unklar ist wo der langjaehrige diktator untergetaucht ist nach einem medienbericht sollen die aufstaendischen einem lastwagen konvoi verfolgen in dem sie gaddafi vermuten sie hofften die fahrzeuge etwa 40 bis 50 kilometer vor tripolis abfangen zu koennen sagte sicherheitsberater abdul karim basama vom uebergangsrat der maltesischen zeitung the times of malta es solle verhindert werden dass gaddafi nach sirte oder sebha durchkomme derweil beklagte bundesverteidigungsminister thomas de maiziere cdu fehler der internationalen staatengemeinschaft bei der einschaetzung der lage in libyen wir haben uns mindestens dreimal geirrt nicht nur wir deutschen der ganze westen sagte er in der bundesakademie fuer sicherheitspolitik in berlin gaddafi sei erst massiv unterschaetzt und dann massiv ueberschaetzt worden dann haben wir gesagt es wird lange anhalten und es gibt ueberhaupt keine veraenderungen und in einer woche war tripolis erobert der chef der libyschen uebergangsregierung mahmud dschibril erklaerte die aufstaendischen haetten inzwischen fast im ganzen land die oberhand nur sebha sirte sowie das suedoestlich von tripolis gelegene bani walid seien noch nicht unter kontrolle sagte er nach angaben der tuerkischen nachrichtenagentur anadolu in ankara ziel sei es die staedte ohne blutvergiessen einzunehmen britische kampfflugzeuge feuerten in der nacht zum freitag raketen auf eine kommando und kontrollzentrale in sirte ab gaddafi gegner riefen die einwohner auf die stadt kampflos zu 
uebergeben im gegenzug sollten nur aus sirte stammende kaempfer in die kuestenstadt einruecken hiess es die aufstaendischen sammelten unterdessen ihre einheiten nahe sirte wo sich einheiten und anhaenger gaddafis verschanzt haben in der garnisonsstadt sebha lieferten sich anhaenger und gegner gaddafis heftige kaempfe dutzende rebellen seien getoetet worden als sie das hauptquartier des militaergeheimdienstes gestuermt haetten teilten die aufstaendischen mit sie haetten zwei stadtviertel eingenommen es gebe kein wasser und keinen strom in tripolis sagten die neuen machthaber ein auf dem zentralen gruenen platz geplantes freitagsgebet aus sicherheitsgruenden ab die glaeubigen versammelten sich in der nahe gelegenen el kabir moschee der prediger scheich ahmed milad gaddur rief die libyer auf keine rache zu ueben unterdessen kommen immer mehr grausamkeiten der vergangenen tage ans licht reporter der fernsehsender el dschasira und bbc berichteten von graeueltaten auf beiden seiten el dschasira zeigte bilder von leichen in gruenanalagen in dem noch am donnerstag besonders heftig umkaempften stadtteil von tripolis abu salim ein reporter beschrieb die lage in einem nahe gelegenen krankenhaus als katastrophal dort stapelten sich die leichen nach informationen der menschenrechtsorganisation amnesty international haben gaddafi anhaenger womoeglich mehr als 100 gefangene in militaercamps nahe tripolis brutal getoetet die gaddafi treuen truppen haetten granaten geworfen und mit schusswaffen auf die gefangenen gefeuert +Ausland de-DE url-pathpart-politik url-pathpart-ausland url-pathpart-krise url-pathpart-in url-pathpart-der url-pathpart-arabischen url-pathpart-welt url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-libyen-china-bot-gaddafi-grosse-mengen-waffen-an_aid_662179 url-pagepartsplitname-libyen url-pagepartsplitname-china url-pagepartsplitname-bot url-pagepartsplitname-gaddafi 
url-pagepartsplitname-grosse url-pagepartsplitname-mengen url-pagepartsplitname-waffen url-pagepartsplitname-an url-pagepartsplitname-aid url-pagepartsplitname-662179 url-lastpartpagename-html libyen china bot gaddafi grosse mengen waffen an china hat sich offenbar ueber un sanktionen hinweg gesetzt und dem gaddafi regime vor seinem zerfall grosse mengen an waffen und munition angeboten die chinesischen waffenhaendler wollten bei bedarf sogar auf bestellung liefern chinesische firmen haetten den truppen von libyens langjaehrigem machthaber muammar gaddafi im juli den verkauf von raketenwerfern panzerabwehrraketen und anderen waffen im wert von rund 200 millionen dollar angeboten berichtete die new york times am montag zuvor hatte auch die kanadische the globe and mail davon berichtet wir haben hinweise dass es geschaeftsbeziehungen zwischen china und gaddafi gegeben hat sagte rebellensprecher abdulrahman busin der new york times und wir haben alle dokumente die dies beweisen demnach reiste eine libysche delegation mitte juli nach china und sprach mit vertretern verschiedener ruestungskonzerne die konzerne haetten angeboten ihre gesamten lagerbestaende zu verkaufen und bei bedarf auf bestellung zu produzieren die lieferungen sollten demnach ueber die drittstaaten algerien und suedafrika erfolgen die chinesische seite habe darauf hingewiesen dass ein teil der waffen bereits in algerien gelagert sei und schnell ueber die grenze nach libyen gebracht werden koenne die gastgeber haetten sich bei den libyern fuer deren diskretion bedankt und auf die noetige vertraulichkeit hingewiesen ob es zu lieferungen gekommen sei gehe aus den dokumenten nicht hervor vertreter der neuen libyschen fuehrung in tripolis sagten die dokumente bestaetigten den verdacht dass es eine enge zusammenarbeit der gaddafi fuehrung mit china algerien und suedafrika gegeben habe der militaerbeauftragte des nationalen uebergangsrates omar hariri sagte er sei sich ziemlich sicher dass waffen in libyen 
eingetroffen und gegen das libysche volk verwendet worden seien die dokumente wurden auf einem muellhaufen in einem stadtteil von tripolis entdeckt in dem viele mitglieder der gaddafi fuehrung lebten china ist das einzige staendige mitglied im un sicherheitsrat das den uebergangsrat bislang nicht als legitimen vertreter libyens anerkannt hat eine bestaetigung fuer die berichte oder die echtheit der dokumente gab es nicht ein nato und ein un vertreter bezeichneten sie der new york times zufolge jedoch als hoechst unwahrscheinlich den vereinten nationen sei nichts von waffengeschaeften mit china bekannt sagte das mitglied einer un kommission zur ueberwachung des libyen embargos das chinesische aussenministerium raeumte ein dass mitglieder von gaddafis regierung auf chinesische ruestungsfirmen zugegangen seien dies sei aber ohne wissen der chinesischen regierung passiert sagte ministeriumssprecherin jian yu die chinesischen firmen haben keine entsprechenden vertraege unterschrieben noch haben sie militaergueter nach libyen ausgeliefert nachdem der un sicherheitsrat die resolution verabschiedet habe seien die entsprechenden regierungsstellen angewiesen worden sie strengstens einzuhalten +Ausland de-DE url-pathpart-politik url-pathpart-ausland url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-afghanistan-auswaertiges-amt-bestaetigt-tod-eines-deutschen-urlaubers_aid_668931 url-pagepartsplitname-afghanistan url-pagepartsplitname-auswaertiges url-pagepartsplitname-amt url-pagepartsplitname-bestaetigt url-pagepartsplitname-tod url-pagepartsplitname-eines url-pagepartsplitname-deutschen url-pagepartsplitname-urlaubers url-pagepartsplitname-aid url-pagepartsplitname-668931 url-lastpartpagename-html afghanistan auswaertiges amt bestaetigt tod eines deutschen urlaubers ein ministeriumssprecher sagte am sonntag in berlin die untersuchungen haetten ergeben dass es sich bei einer der beiden 
in der westafghanischen provinz ghor getoeteten personen um einen deutschen staatsangehoerigen handele nach angaben der afghanischen behoerden hatten bewaffnete angreifer den deutschen touristen sowie seinen afghanischen begleiter am samstag in der als relativ sicher geltenden region erschossen die provinz ghor gilt normalerweise als verhaeltnismaessig ruhig der tourist sei aber von den behoerden gewarnt worden dass seine sicherheit nicht gewaehrleistet werden koenne sagte vize polizeichef abdul raschid die leiche des deutschen sei in die hauptstadt kabul gebracht worden +Ausland de-DE url-pathpart-politik url-pathpart-ausland url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-somalia-kalaschnikows-fuer-gute-koranschueler_aid_667250 url-pagepartsplitname-somalia url-pagepartsplitname-kalaschnikows url-pagepartsplitname-fuer url-pagepartsplitname-gute url-pagepartsplitname-koranschueler url-pagepartsplitname-aid url-pagepartsplitname-667250 url-lastpartpagename-html somalia kalaschnikows fuer gute koranschueler ein von der radikalislamischen el schabaab miliz betriebener radiosender in somalia hat einen rezitierwettbewerb fuer kinder veranstaltet als hauptpreis gab es nicht nur geld zu gewinnen sondern auch jede menge schwere waffen der gewinner ging mit einem ak 47 kalaschnikow sturmgewehr und einem geldpreis in hoehe von umgerechnet 500 euro nach hause wie der britische sender bbc am dienstag berichtete der zweiplatzierte erhielt ebenfalls eine kalaschnikow und eine summe von 350 euro waehrend der drittplatzierte den wettstreit mit zwei handgranaten und etwa 300 euro verliess zudem erhielten die teilnehmer religioese buecher hiess es kinder sollten eine hand fuer ihre erziehung benutzen und in der anderen eine waffe halten um den islam zu beschuetzen sagte el schabaab sprecher mukhtar robow bei der preisverleihung auch raketenwerfer gab es schon als hautpgewinn der 
wettbewerb des radiosenders andulus wurde im fastenmonat ramadan in elasha 20 kilometer von der hauptstadt mogadischu entfernt ausgetragen zur teilnahme aufgerufen waren kinder und jugendliche zwischen zehn und 17 jahren in den vergangenen zwei jahren hatten kinder bei aehnlichen wettbewerben bereits um einen raketenwerfer als hauptgewinn gerungen die rebellen der el schabaab kontrollieren grosse teile des suedlichen und zentralen somalias wo derzeit eine verheerende duerre herrscht die miliz bekaempft die uebergangsregierung in mogadischu und will am horn von afrika einen islamischen gottesstaat einrichten der sich an einem weltweiten dschihad beteiligt +Ausland de-DE url-pathpart-politik url-pathpart-ausland url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-drohung-von-el-kaida-100-anschlaege-als-rache-fuer-bin-laden_aid_657300 url-pagepartsplitname-drohung url-pagepartsplitname-von url-pagepartsplitname-el url-pagepartsplitname-kaida url-pagepartsplitname-100 url-pagepartsplitname-anschlaege url-pagepartsplitname-als url-pagepartsplitname-rache url-pagepartsplitname-fuer url-pagepartsplitname-bin url-pagepartsplitname-laden url-pagepartsplitname-aid url-pagepartsplitname-657300 url-lastpartpagename-html drohung von el kaida 100 anschlaege als rache fuer bin laden dem irak droht eine neue welle der gewalt +Ausland de-DE url-pathpart-politik url-pathpart-ausland url-pathpart-article2014661 url-domainname-www.abendblatt.de url-domainprefix-www url-domainprefix-www.abendblatt url-domaintype-de url-domainsuffix-abendblatt.de url-firstpartpagename-Gaddafi-verhoerte-Terrorverdaechtige-fuer-die-CIA url-pagepartsplitname-Gaddafi url-pagepartsplitname-verhoerte url-pagepartsplitname-Terrorverdaechtige url-pagepartsplitname-fuer url-pagepartsplitname-die url-pagepartsplitname-CIA url-lastpartpagename-html gaddafi verhoerte terrorverdaechtige fuer die cia tripolis die zentrale des 
libyschen geheimdienstes lis liegt mitten in tripolis beiderseits der strasse sind rebellen postiert vor dem eingangstor stehen zwei gelaendefahrzeuge mit schweren geschuetzen auf der ladeflaeche schwere bewachung kein wunder +Unterhaltung de-DE url-pathpart-nachrichten url-pathpart-stars url-portnumber-80 url-domainname-www.tz-online.de url-domainprefix-www url-domainprefix-www.tz-online url-domaintype-de url-domainsuffix-tz-online.de url-hasarguments- url-firstpartpagename-roland-emmerich-macht-actionfilme-liebe-1436652 url-pagepartsplitname-roland url-pagepartsplitname-emmerich url-pagepartsplitname-macht url-pagepartsplitname-actionfilme url-pagepartsplitname-liebe url-pagepartsplitname-1436652 url-lastpartpagename-html url-argname-cmp url-argvalue-defrss url-argvaluesplit-defrss roland emmerich macht actionfilme mit liebe nuernberg so hatte sich monika gruber ihren ersten auftritt bei wetten dass sicher nicht vorgestellt die kabarettistin wurde bei der samstagabend show opfer einer technsichen panne das war passiert +Unterhaltung de-DE url-pathpart-nachrichten url-pathpart-stars url-portnumber-80 url-domainname-www.tz-online.de url-domainprefix-www url-domainprefix-www.tz-online url-domaintype-de url-domainsuffix-tz-online.de url-hasarguments- url-firstpartpagename-pete-doherty-kann-seine-strafe-nicht-bezahlen-1453565 url-pagepartsplitname-pete url-pagepartsplitname-doherty url-pagepartsplitname-kann url-pagepartsplitname-seine url-pagepartsplitname-strafe url-pagepartsplitname-nicht url-pagepartsplitname-bezahlen url-pagepartsplitname-1453565 url-lastpartpagename-html url-argname-cmp url-argvalue-defrss url-argvaluesplit-defrss pete doherty rocker pete doherty 32 will seine schulden bei der berliner justiz in raten abstottern der musiker habe ein entsprechendes gesuch gestellt teilte staatsanwaltschaftssprecher martin steltner am mittwoch mit das muss geprueft werden der musiker hatte sich einen strafbefehl von 30 000 euro eingehandelt weil er im dezember 
2009 in kreuzberg bei einer kneipentour ein auto demoliert haben soll noch gebe es keinen haftbefehl aber doherty muesse damit rechnen wenn er seine strafe nicht zahle erklaerte steltner zu einem bericht der zeitung b z wenn der musiker die summe begleiche sei die angelegenheit erledigt und dem musiker bleibe eine ersatzfreiheitsstrafe von 30 tagen hinter gittern erspart nuernberg so hatte sich monika gruber ihren ersten auftritt bei wetten dass sicher nicht vorgestellt die kabarettistin wurde bei der samstagabend show opfer einer technsichen panne das war passiert +Unterhaltung de-DE url-pathpart-kultur url-pathpart-kino url-pathpart-tv url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-late-night-show-ohne-zuschauer-harald-schmidts-magere-quoten_aid_674601 url-pagepartsplitname-late url-pagepartsplitname-night url-pagepartsplitname-show url-pagepartsplitname-ohne url-pagepartsplitname-zuschauer url-pagepartsplitname-harald url-pagepartsplitname-schmidts url-pagepartsplitname-magere url-pagepartsplitname-quoten url-pagepartsplitname-aid url-pagepartsplitname-674601 url-lastpartpagename-html late night show ohne zuschauer harald schmidts magere quoten die harald schmidt show hat noch immer wenig zuschauer bei den 14 bis 49 jaehrigen schauen nur 9 1 prozent die late night show schmidts sender sat 1 betrachtet die quoten ganz entspannt harald schmidts quoten sind weiterhin schlecht nach angabens seines senders sat 1 erreicht der moderator mit seiner late night show durchschnittlich derzeit nur 9 1 prozent in der werberelevanten zielgruppe der 14 bis 49 jaehrigen die zahl der zuschauer liegt bei 510 000 beim gesamtpublikum sind es im schnitt 830 000 und damit 6 9 prozent der sender sat 1 sieht die bisher mageren quoten seines neuen late night talkers dennoch gelassen wir betrachten die quoten ganz entspannt sagte sat 1 sprecherin diana schardt auf anfrage uns ist bewusst dass es 
etwas zeit benoetigt bis sich die zuschauer an die regelmaessige sendezeit um 23 15 uhr an zwei aufeinanderfolgenden tagen gewoehnen die harald schmidt show war am 13 september gestartet schmidt der bereits zwischen 1995 bis 2003 fuer sat 1 gearbeitet hatte moderierte zuvor eine late night show in der ard sein neuer vertrag mit dem privatsender enthaelt nach aussagen des moderators explizit keine quotenvorgaben +Sport de-DE url-pathpart-c url-pathpart-32191 url-pathpart-f url-pathpart-443319 url-pathpart-s url-pathpart-17f65946 url-pathpart-l url-pathpart-0L0Sfocus0Bde0Csport0Cmehrsport0Cbasketball0Ewm0Ebasketball0Eem0Eerster0Esieg0Efuer0Eisrael0Iaid0I6620A570Bhtml url-domainname-rss2.focus.de url-domainprefix-rss2 url-domainprefix-rss2.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-story01 url-pagepartsplitname-story01 url-lastpartpagename-htm basketball wm basketball em israel ist bei der basketball em der erste sieg gelungen der bisherige tabellenletzte der deutschen gruppe b setzte sich mit 91 +Sport de-DE url-pathpart-sport url-pathpart-fussball url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-junioren-u17-auswahl-startet-mit-sieg-in-em-qualifikation_aid_674504 url-pagepartsplitname-junioren url-pagepartsplitname-u17 url-pagepartsplitname-auswahl url-pagepartsplitname-startet url-pagepartsplitname-mit url-pagepartsplitname-sieg url-pagepartsplitname-in url-pagepartsplitname-em url-pagepartsplitname-qualifikation url-pagepartsplitname-aid url-pagepartsplitname-674504 url-lastpartpagename-html junioren u17 auswahl startet mit sieg in em qualifikation die u17 auswahl des deutschen fussball bundes ist mit einem klaren sieg in die erste qualifikationsrunde zur europameisterschaft 2012 gestartet die u17 auswahl des deutschen fussball bundes dfb ist mit einem klaren sieg in die erste qualifikationsrunde zur europameisterschaft 2012 gestartet die 
mannschaft von dfb trainer stefan boeger setzte sich in tallinn gegen gastgeber estland mit 5 +Technik & Wissen de-DE url-pathpart-wissenschaft url-pathpart-article13608441 url-domainname-www.welt.de url-domainprefix-www url-domainprefix-www.welt url-domaintype-de url-domainsuffix-welt.de url-firstpartpagename-Pflanzen-legen-Wasservorrat-im-Boden-an url-pagepartsplitname-Pflanzen url-pagepartsplitname-legen url-pagepartsplitname-Wasservorrat url-pagepartsplitname-im url-pagepartsplitname-Boden url-pagepartsplitname-an url-lastpartpagename-html vorratswirtschaft pflanzen sondern im wurzelbereich eine wabbelige substanz ab die hilft ihnen wasser im boden speichern zu koennen so koennen sie trockenphasen ueberbruecken pflanzen koennen mit wasser im boden eine art vorratswirtschaft betreiben zumindest koennen sie nach neuen forschungsergebnissen in der naehe ihrer wurzeln so viel wasser speichern dass sie damit kuerzere trockenperioden ueberstehen koennen das haben wissenschaftler der universitaet goettingen zusammen mit kollegen aus potsdam leipzig und den usa herausgefunden foto +Technik & Wissen de-DE url-pathpart-wissenschaft url-pathpart-article13608359 url-domainname-www.welt.de url-domainprefix-www url-domainprefix-www.welt url-domaintype-de url-domainsuffix-welt.de url-firstpartpagename-Bitte-wer-Das-ist-Albert-Szent-Gyoergyi url-pagepartsplitname-Bitte url-pagepartsplitname-wer url-pagepartsplitname-Das url-pagepartsplitname-ist url-pagepartsplitname-Albert url-pagepartsplitname-Szent url-pagepartsplitname-Gyoergyi url-lastpartpagename-html google doodle die suchmaschine ueberrascht wieder einmal auf ihrer startseite was mag sich hinter diesem etikett einer obstkiste verstecken es ist albert szent gyoergyi albert szent wer frisches obst auf der startseite von google das google doodle ein gekritzel aus besonderem anlass das das unternehmslogo von google verfremdet darstellt hatte zuletzt mit einer hommage an freddy mercury den 1991 verstorbenen saenger der 
rockband queen ueberrascht foto +Technik & Wissen de-DE url-pathpart-wissen url-domainname-www.stern.de url-domainprefix-www url-domainprefix-www.stern url-domaintype-de url-domainsuffix-stern.de url-firstpartpagename-qumran-rollen-museum-stellt-uralte-bibelschriften-online-1732537 url-pagepartsplitname-qumran url-pagepartsplitname-rollen url-pagepartsplitname-museum url-pagepartsplitname-stellt url-pagepartsplitname-uralte url-pagepartsplitname-bibelschriften url-pagepartsplitname-online url-pagepartsplitname-1732537 url-lastpartpagename-html#utm_source=standard&utm_medium=rssfeed&utm_campaign=wissen video die 2000 jahre alten qumran schriften sind jetzt online zugaenglich das israelische nationalmuseum hat die schriftrollen digitalisiert und im internet auf einer eigens von google entwickelten datenbank veroeffentlicht +Technik & Wissen de-DE url-pathpart-2011 url-pathpart-16 url-domainname-www.zeit.de url-domainprefix-www url-domainprefix-www.zeit url-domaintype-de url-domainsuffix-zeit.de url-firstpartpagename-Strahlung url-pagepartsplitname-Strahlung geschichte der radioaktivitaet frau roentgen ist verstimmt sie hat bereits dreimal nach ihrem mann geschickt um ihn zum abendessen zu holen erst kommt er gar nicht dann sitzt er schweigend am tisch isst nur ein paar bissen und verschwindet auch schon wieder im labor in jenen novembertagen 1895 arbeitet der wuerzburger physiker wilhelm conrad roentgen wie besessen an einer interessanten entdeckung am 28 dezember macht er sie mit seiner studie ueber eine neue art von strahlen oeffentlich und einen monat spaeter berichtet er in einer vorlesung erstmals ueber die geheimnisvollen x strahlen er belaesst es aber nicht bei worten sondern holt den schweizer anatomie professor rudolf albert von koelliker aus dem auditorium nach vorn koelliker muss seine hand auf eine belichtungsplatte legen dann jagt roentgen strom durch eine seltsam geschwaerzte roehre anschliessend haelt er den gebannten zuhoerern die belichtete aufnahme 
vor die nase sie zeigt deutlich erkennbar die handknochen des kollegen roentgens entdeckung ist ebenso spektakulaer wie leicht verstaendlich der blick durch die aeussere materie ins innenleben von menschen und dingen loest einen taumel der begeisterung aus und revolutioniert die medizin in rasendem tempo die new york sun spricht von einem triumph der wissenschaft +Technik & Wissen de-DE url-pathpart-artikel url-domainname-www.neues-deutschland.de url-domainprefix-www url-domainprefix-www.neues-deutschland url-domaintype-de url-domainsuffix-neues-deutschland.de url-firstpartpagename-174431 url-pagepartsplitname-174431 url-lastpartpagename-html viel zoff um einen stoff ein erbitterter streit geht in die naechste runde gegenstand des konflikts ist eine farblose kristalline chemikalie sie ist unverzichtbar fuer die herstellung von polycarbonat kunststoffen und einigen kunstharzen +Wirtschaft de-DE url-pathpart-finanzen url-pathpart-news url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-illegaler-insider-handel-wall-street-mogul-muss-elf-jahre-hinter-gitter_aid_674503 url-pagepartsplitname-illegaler url-pagepartsplitname-insider url-pagepartsplitname-handel url-pagepartsplitname-wall url-pagepartsplitname-street url-pagepartsplitname-mogul url-pagepartsplitname-muss url-pagepartsplitname-elf url-pagepartsplitname-jahre url-pagepartsplitname-hinter url-pagepartsplitname-gitter url-pagepartsplitname-aid url-pagepartsplitname-674503 url-lastpartpagename-html illegaler insider handel wall street mogul muss elf jahre hinter gitter nach einem spektakulaeren prozess hat ein new yorker gericht den wall street mogul raj rajaratnam wegen insider handels zu elf jahren gefaengnis verurteilt weil der haendler krank ist und eifrig spendet fiel die strafe nicht noch haerter aus das gericht setzte damit am donnerstag ein geringeres strafmass gegen den 53 jaehrigen milliardaer fest als von der 
staatsanwaltschaft gefordert rajaratnam der den galleon hedgefonds gegruendet hatte muss ab ende november in haft und soll zuvor ein bussgeld in hoehe von zehn millionen dollar knapp 7 3 millionen euro zahlen er wurde am 11 mai schuldig gesprochen zwischen 2003 und 2009 millionenschwere insidergeschaefte verantwortet zu haben insgesamt soll er damit rund 72 millionen dollar etwa 52 5 millionen euro erwirtschaftet haben die staatsanwaltschaft hatte eine haftstrafe von mindestens neunzehneinhalb jahren gefordert richter richard holwell begruendete das mildere strafmass mit gesundheitlichen problemen rajaratnams der aus sri lanka stammende finanzmogul habe diabetes in fortgeschrittenem stadium was zu einer nierenstoerung fuehre sagte holwell als weiteren grund nannte der richter rajaratnams grosszuegige spendentaetigkeit die weit ueber das uebliche mass hinausreiche ueberdies seien insidergeschaefte weniger gefaehrlich als betrugsmodelle nach dem schneeballsystem wie sie zum beispiel von dem 2009 verurteilten milliardenbetrueger bernard madoff angewendet wurden das gegen rajaratnam festgesetzte strafmass ist laut wall street journal das hoechste das in den vergangenen 20 jahren in den usa wegen insidergeschaeften verhaengt wurde die verteidigung hatte auf maximal sechseinhalb jahre haft plaediert da der von rajaratnam mit den insidergeschaeften erwirtschaftete gewinn weniger als acht millionen dollar 5 8 millionen euro betrage +Gesundheit de-DE url-pathpart-ratgeber url-pathpart-gesundheit url-pathpart-article1971240 url-domainname-www.abendblatt.de url-domainprefix-www url-domainprefix-www.abendblatt url-domaintype-de url-domainsuffix-abendblatt.de url-firstpartpagename-Testosteron-schuetzt-vor-Entzuendungen-und-Allergien url-pagepartsplitname-Testosteron url-pagepartsplitname-schuetzt url-pagepartsplitname-vor url-pagepartsplitname-Entzuendungen url-pagepartsplitname-und url-pagepartsplitname-Allergien url-lastpartpagename-html testosteron schuetzt vor entzuendungen 
und allergien jena das maennliche erscheinungsbild wird in grossem masse vom geschlechtshormon testosteron gesteuert es laesst haare und muskeln wachsen hinzu kommt offenbar noch eine abwehrstaerkende wirkung des hormons deshalb leiden maenner seltener unter entzuendungen und allergien als frauen das teilte die universitaet jena am dienstag mit bestimmte zellen erzeugen demnach im weiblichen koerper fast doppelt so viele stoffe die eine entzuendung foerdern wie im maennlichen organismus daher wuerden vor allem frauen durch arthritis oder asthma geplagt doch der wirkung maennlicher hormone koennen sich auch die weiblichen immunzellen nicht entziehen im reagenzglas habe sich gezeigt dass testosteron in den zellen beider geschlechter die enzyme hemmt die fuer eine entzuendliche reaktion sorgen die jenaer wissenschaftler carlo pergola und oliver werz fordern deshalb massgeschneiderte therapien fuer maenner und frauen +Wirtschaft de-DE url-pathpart-wirtschaft url-pathpart-soziales url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,792948,00 url-pagepartsplitname-0,1518,792948,00 url-lastpartpagename-html#ref=rss konjunkturprognose die bundesregierung rechnet fuer 2012 mit einer spuerbar schwaecheren konjunktur doch die buerger sollen das nicht spueren im gegenteil +Deutschland de-DE url-pathpart-politik url-pathpart-deutschland url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-bundespraesident-in-tokio-wulff-reist-ins-japanische-katastrophengebiet_aid_677623 url-pagepartsplitname-bundespraesident url-pagepartsplitname-in url-pagepartsplitname-tokio url-pagepartsplitname-wulff url-pagepartsplitname-reist url-pagepartsplitname-ins url-pagepartsplitname-japanische url-pagepartsplitname-katastrophengebiet url-pagepartsplitname-aid url-pagepartsplitname-677623 
url-lastpartpagename-html bundespraesident in tokio wulff reist ins japanische katastrophengebiet nach seiner ankunft in japan trifft bundespraesident wulff den kaiser japans regierung ergreift jetzt die initiative und fordert den betreiber tepco auf 24 milliarden euro einzusparen sonst gibt es kein geld vom staat am zweiten tag seines besuchs in japan ist bundespraesident christian wulff in tokio mit kaiser akihito und mit hochrangigen japanischen unternehmern zusammengetroffen akihito begruesste den deutschen staatsgast am montag im kaiserlichen palast in der innenstadt von tokio offizieller anlass des fuenftaegigen besuchs ist der 150 jahrestag der aufnahme diplomatischer beziehungen zwischen tokio und berlin bei dem treffen mit dem kaiser und einem anschliessenden mittagessen fuer die delegation duerfte es aber auch um die folgen der reaktorkatastrophe von fukushima gegangen sein bereits nach seiner ankunft am sonntag hatte wulff dem japanischen volk weiter solidaritaet und unterstuetzung aus deutschland zugesichert am montagabend trifft wulff auch mit regierungschef yoshihiko noda und anderen fuehrenden politikern des landes zusammen am dienstag will sich der bundespraesident im katastrophengebiet selbst ein bild von der lage machen und mit menschen sprechen die immer noch in behelfsunterkuenften leben auch sieben monate nach erdbeben tsunami und atomunfall sind noch zehntausende ohne obdach auch mit vorstandsvorsitzenden japanischer konzerne eroerterte wulff am montag konsequenzen aus dem atomunfall von fukushima ueber die zukunft der kernkraft gab es unterschiedliche positionen die nach angaben aus delegationskreisen vor dem bundespraesidenten offen diskutiert wurden wulff sprach mit den unternehmern auch ueber bestehende barrieren fuer deutsche unternehmen etwa bei investitionen in den bereichen energie und umwelttechnik in japan er hob die bedeutung des standortes hervor und sagte +Wirtschaft de-DE url-pathpart-finanzen url-pathpart-boerse 
url-domainname-www.manager-magazin.de url-domainprefix-www url-domainprefix-www.manager-magazin url-domaintype-de url-domainsuffix-manager-magazin.de url-firstpartpagename-0,2828,792885,00 url-pagepartsplitname-0,2828,792885,00 url-lastpartpagename-html#ref=rss langfristige aktien investments guenstige bewertungen locken derzeit so genannte value investoren an die boerse gute voraussetzungen fuer erfolgreiches langfristiges investieren bringen wissenschaftlern zufolge gerade deutsche anleger mit aber bestehen sie damit auch den praxistest hamburg an der boerse schlaegt gegenwaertig die stunde der value investoren das sind jene anleger die per fundamentalanalyse also durch moeglichst praezise ermittlung des unternehmenswertes sowie vergleich dessen mit der boersenbewertung guenstige papiere identifizieren und dann kaufen getreu einem populaeren rat des einstigen boersengurus andre kostolany wonach sich anleger aktien ins depot legen und dann ein paar jahre schlafen sollen werden die papiere in der folge eisern gehalten damit sie langfristig die erhoffte rendite bringen so zumindest die theorie experten zufolge koennte der zeitpunkt fuer ein value investment zurzeit besser kaum sein das ist ganz offensichtlich sagt etwa hendrik leber chef des value spezialisten acatis zu manager magazin online momentan bekommt man am aktienmarkt viel gute qualitaet fuer wenig geld leber haelt vor allem viele firmen der oel gas und energiebranche fuer attraktiv wie etwa die deutschen versorger eon und rwe oder die tschechische cez gleiches gilt fuer den industriezweig medizintechnik beispielsweise mit der firma medtronic die unter anderem implantate wie herzschrittmacher herstellt als prominentester und wohl auch erfolgreichster value investor weltweit gilt der us amerikanische multimilliardaer warren buffett chef der anlagefirma berkshire hathaway und einst schueler von benjamin graham dem urvater dieser anlagestrategie beide wussten +Wirtschaft de-DE url-pathpart-wirtschaft 
url-pathpart-news url-domainname-www.stern.de url-domainprefix-www url-domainprefix-www.stern url-domaintype-de url-domainsuffix-stern.de url-firstpartpagename-herabstufung-der-kreditwuerdigkeit-moodys-droht-frankreich-1739890 url-pagepartsplitname-herabstufung url-pagepartsplitname-der url-pagepartsplitname-kreditwuerdigkeit url-pagepartsplitname-moodys url-pagepartsplitname-droht url-pagepartsplitname-frankreich url-pagepartsplitname-1739890 url-lastpartpagename-html#utm_source=standard&utm_medium=rssfeed&utm_campaign=wirtschaft herabstufung der kreditwuerdigkeit die ratingagentur moody s prueft die herabstufung von frankreichs kreditwuerdigkeit innerhalb der kommenden drei monate werde moody s bewerten inwieweit die franzoesische regierung die angekuendigten massnahmen zur reduzierung des staatsdefizits umsetze erklaerte die ratingagentur am montag anhand dessen solle entschieden werden ob frankreich das derzeit mit der bestnote aaa bewertet wird noch die bedingungen fuer die perspektive stabil erfuelle neben moody s bewerten derzeit auch die beiden anderen grossen ratingagenturen standard poor s und fitch frankreich mit der bestnote diese ermoeglicht es dem land auf den internationalen finanzmaerkten unter guenstigen bedingungen kredite aufzunehmen sollte moody s zu dem ergebnis kommen dass die note mit einer negativen perspektive einhergeht waere frankreich nach den usa das zweite grosse land dessen kreditwuerdigkeit herabgestuft wird moody s betonte dass es sich bei der studie um eine routinemaessig einmal jaehrlich stattfindene ueberpruefung der finanzlage frankreichs handle die agentur ist aber die erste der drei grossen konkurrenten die die hoechstbewertung fuer frankreich infrage stellt im zusammenhang mit den massiven finanzproblemen der franzoesisch belgischen grossbank dexia hatte die ratingagentur anfang des monats noch die stabilitaet frankreichs betont waehrend sie belgien vor einer herabstufung seiner kreditwuerdigkeit gewarnt hatte +Wirtschaft 
de-DE url-pathpart-print url-pathpart-die url-pathpart-welt url-pathpart-finanzen url-pathpart-article13668414 url-domainname-www.welt.de url-domainprefix-www url-domainprefix-www.welt url-domaintype-de url-domainsuffix-welt.de url-firstpartpagename-Finanzen-Kompakt url-pagepartsplitname-Finanzen url-pagepartsplitname-Kompakt url-lastpartpagename-html kompakt millionen gesetzlich versicherte sparen im kommenden jahr geld +Wirtschaft de-DE url-pathpart-finanzen url-pathpart-der url-pathpart-staat url-pathpart-spekuliert url-pathpart-gegen url-pathpart-seine url-pathpart-buerger url-pathpart-485903 url-domainname-www.wiwo.de url-domainprefix-www url-domainprefix-www.wiwo url-domaintype-de url-domainsuffix-wiwo.de der staat spekuliert gegen seine buerger jeden tag ein neuer vorschlag zur rettung der welt zumindest aber der eu und des euros brot und spiele im alten rom zur befriedung der massen eingesetzt funktioniert auch im neuen deutschland es gibt viel zu verteilen inzwischen vergeht kein tag ohne neue vorschlaege zur rettung der welt vor dem finanziellen kollaps danach bemuehen die medien sich ganz schnell sie zu interpretieren bevor politiker und banker wieder etwas neues vorschlagen erkenntniswert tendenz gegen null worum es wirklich geht hat der philosoph peter sloterdijk schon vor einem jahr so formuliert +Wirtschaft de-DE url-pathpart-finanzen url-pathpart-article13666843 url-domainname-www.welt.de url-domainprefix-www url-domainprefix-www.welt url-domaintype-de url-domainsuffix-welt.de url-firstpartpagename-Die-zweifelhaften-Klima-Bilanzen-deutscher-Firmen url-pagepartsplitname-Die url-pagepartsplitname-zweifelhaften url-pagepartsplitname-Klima url-pagepartsplitname-Bilanzen url-pagepartsplitname-deutscher url-pagepartsplitname-Firmen url-lastpartpagename-html co2 emissionen mehr als 100 deutsche konzerne berichten ueber ihre co 2 emissionen daraus wird ein index fuer investoren erstellt doch der sinn des ganzen ist fraglich es ist ein unglaublicher aufwand 
+Unterhaltung de-DE url-pathpart-nachrichten url-pathpart-stars url-portnumber-80 url-domainname-www.tz-online.de url-domainprefix-www url-domainprefix-www.tz-online url-domaintype-de url-domainsuffix-tz-online.de url-hasarguments- url-firstpartpagename-carreys-tochter-reicht-scheidung-1463234 url-pagepartsplitname-carreys url-pagepartsplitname-tochter url-pagepartsplitname-reicht url-pagepartsplitname-scheidung url-pagepartsplitname-1463234 url-lastpartpagename-html url-argname-cmp url-argvalue-defrss url-argvaluesplit-defrss jim carreys tochter reicht die scheidung ein nuernberg so hatte sich monika gruber ihren ersten auftritt bei wetten dass sicher nicht vorgestellt die kabarettistin wurde bei der samstagabend show opfer einer technsichen panne das war passiert +Deutschland de-DE url-pathpart-politik url-pathpart-deutschland url-pathpart-article2063264 url-domainname-www.abendblatt.de url-domainprefix-www url-domainprefix-www.abendblatt url-domaintype-de url-domainsuffix-abendblatt.de url-firstpartpagename-Weniger-Faelle-von-sexuellem-Kindesmissbrauch url-pagepartsplitname-Weniger url-pagepartsplitname-Faelle url-pagepartsplitname-von url-pagepartsplitname-sexuellem url-pagepartsplitname-Kindesmissbrauch url-lastpartpagename-html weniger faelle von sexuellem kindesmissbrauch trotz der juengsten skandale ist die zahl der opfer zurueckgegangen offensichtlich zeigen missbrauchsopfer ihre peiniger haeufiger an berlin anders als die juengsten skandale nahe legen sind faelle von sexuellem kindesmissbrauch in den vergangenen jahren weniger geworden in einer repraesentativen befragung des kriminologischen forschungsinstituts niedersachsen leitung +Gesundheit de-DE url-pathpart-gesundheit url-pathpart-ratgeber url-pathpart-zukunftsmedizin url-pathpart-therapie url-pathpart-tid url-pathpart-9595 url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de 
url-firstpartpagename-ips-der-durchbruch-von-yamanaka_aid_295044 url-pagepartsplitname-ips url-pagepartsplitname-der url-pagepartsplitname-durchbruch url-pagepartsplitname-von url-pagepartsplitname-yamanaka url-pagepartsplitname-aid url-pagepartsplitname-295044 url-lastpartpagename-html risiken nebenwirkung das menschliche erbgut ist wie eine bibliothek die jeder einzelnen koerperzelle zur verfuegung steht doch die meisten koerperzellen lesen nur einzelne kapitel aus dem buch das zu ihrem organ oder gewebe gehoert ihr entwicklungspotenzial ist daher begrenzt biologisch hat das einen sinn denn je laenger eine zelle lebt desto eher haeuft sich fehlerhafter buchstabensalat an damit steigt zum beispiel das risiko fuer krebs embryonale stammzellen hingegen haben noch keine festgelegten lesevorlieben +Gesundheit de-DE url-pathpart-ratgeber url-pathpart-gesundheit url-pathpart-article1977416 url-domainname-www.abendblatt.de url-domainprefix-www url-domainprefix-www.abendblatt url-domaintype-de url-domainsuffix-abendblatt.de url-firstpartpagename-Bio-Bandscheiben-sollen-Rueckenschmerzen-lindern url-pagepartsplitname-Bio url-pagepartsplitname-Bandscheiben url-pagepartsplitname-sollen url-pagepartsplitname-Rueckenschmerzen url-pagepartsplitname-lindern url-lastpartpagename-html bio bandscheiben sollen rueckenschmerzen lindern washington kaputte bandscheiben wollen us forscher in zukunft mit biologischen implantaten ersetzen erste tests mit bio bandscheiben die aus zellen von schafen aufgebaut worden waren seien in versuchen mit ratten erfolgreich verlaufen die implantate ermoeglichten den tieren volle beweglichkeit und hatten sich nach sechs monaten fast wie natuerliche bandscheiben in die wirbelsaeule integriert schreiben die experten in den proceedings der us akademie der wissenschaften probleme mit den bandscheiben sind die hauptursache fuer schmerzen im ruecken und nackenbereich und verursachen enorme kosten schreiben die forscher in den meisten faelle wuerden die 
beschwerden konservativ mit physiotherapie und medikamenten behandelt auch operationen seien moeglich bei denen kaputte bandscheiben unter anderem durch mechanische implantate ersetzt werden koennen der medizinische nutzen solcher implantate sei unter fachleuten umstritten unabhaengig davon versagten aber viele implantate ihren dienst weil sie sich lockerten verrutschten oder einfach im laufe der zeit abnutzten die forscher um robby bowles von der cornell university in ithaca us staat new york testeten nun rein biologische bandscheiben implantate dazu erstellten sie zunaechst auf grundlage von computertomographie bildern eine art gussmodell der zu ersetzenden bandscheibe dieses modell nutzten sie dann um mit hilfe von zellen aus der bandscheibe von schafen ein implantat aufzubauen bandscheiben bestehen aus einem inneren wasserreichen gallertkern und einem aeusseren faserring diesen aufbau bildeten die wissenschaftler mit hilfe der unterschiedlichen zelltypen nach sie verpflanzten die bio bandscheibe dann in die schwanzwirbelsaeule von ratten nachdem den nagern dort die betreffende bandscheibe entfernt worden war das implantat liess sich gut in die luecke zwischen den wirbeln einpassen berichten die forscher nach sechs monaten stellen sie fest dass die hoehe der bio bandscheibe erhalten gelieben war und diese sich in die umliegenden wirbelkoerper integriert hatte die zellen hatten eine so genannte extrazellulaere matrix gebildet die sich in der biochemischen zusammensetzung von der natuerlicher bandscheiben kaum unterschied schreiben die forscher weiter auch die mechanischen eigenschaften des implantats also etwa die reaktion auf druckbelastungen aehnelten denen natuerlicher bandscheiben bevor solche bio bandscheiben beim menschen getestet werden koennen muessten viele fragen geklaert werden menschliche bandscheiben seien sehr viel groesser und die mechanischen anforderungen an ein implantat anders als im rattenschwanz ausserdem sei unklar wie das ersatzgewebe 
reagiere wenn es in ein entzuendetes umfeld implantiert werde die bandscheiben der ratten seien vor der implantation gesund gewesen bei patienten die eine neue bandscheibe benoetigten sei das nicht der fall zudem eigneten sich fuer anwendungen beim menschen keine schafszellen zur herstellung des implantats dpa +Deutschland de-DE url-pathpart-politik url-pathpart-deutschland url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,794069,00 url-pagepartsplitname-0,1518,794069,00 url-lastpartpagename-html#ref=rss minutenprotokoll angela merkel reist mit voller rueckendeckung des bundestages zum eu gipfel nach bruessel bei der abstimmung ueber die staerkung des euro rettungsschirms erreichte sie 503 stimmen und die kanzlermehrheit lesen sie im minutenprotokoll nach wie der tag im bundestag lief fuer kanzlerin angela merkel war es ein erfolgreicher tag im bundestag die erste huerde vor ihrer reise zum eu gipfel nach bruessel schaffte sie mit bravour nach einer turbulenten debatte erhielt der entschliessungsantrag der fraktionen von union fdp spd und gruenen 503 stimmen darunter 311 aus der regierungskoalition die kanzlermehrheit alle fraktionen bis auf die linke gaben damit ihre zustimmung fuer verhandlungen der euro laender ueber eine hoehere schlagkraft des rettungsfonds efsf damit ist merkel in bruessel voll verhandlungsfaehig am mittag hatte merkel mit einer leidenschaftlichen rede um unterstuetzung geworben +Wirtschaft de-DE url-pathpart-print url-pathpart-die url-pathpart-welt url-pathpart-finanzen url-pathpart-article13674828 url-domainname-www.welt.de url-domainprefix-www url-domainprefix-www.welt url-domaintype-de url-domainsuffix-welt.de url-firstpartpagename-Dax-mit-Kurssprung-ins-Wochenende url-pagepartsplitname-Dax url-pagepartsplitname-mit url-pagepartsplitname-Kurssprung url-pagepartsplitname-ins url-pagepartsplitname-Wochenende url-lastpartpagename-html dax 
mit kurssprung ins wochenende die juengsten aussagen der ratingagentur fitch haben dem dax am freitag noch einmal einen kraeftigen schub gegeben der deutsche leitindex stand zum handelsende mit einem plus von 3 6 prozent bei 5970 punkten den vormittag ueber hatte er noch eher moderate gewinne verzeichnet auch mdax und tecdax zogen an der index mittelgrosser werte gewann 2 9 prozent auf 8956 punkte der technologieindex tecdax drehte zurueck ins plus und arbeitete sich um 1 6 prozent auf 690 punkte vor haendler verwiesen auf aussagen der ratingagentur fitch wonach sie keine rating aenderungen wegen des eu gipfels erwartet zudem wuerde eine starke eu loesung italien und spanien vor einer herabstufung schuetzen deutsche staatsanleihen gaben trotz schwindender erwartungen an den eu gipfel zur schuldenkrise nach der richtungweisende bund future sank bis zum abend um 60 ticks auf 134 63 punkte die rendite der zehnjaehrigen bundesanleihe kletterte auf 2109 prozent der kurs des euro erholte sich nach der veroeffentlichung des besser als erwartet ausgefallenen ifo geschaeftsklimas von anfaenglichen verlusten die europaeische gemeinschaftswaehrung wurde am abend mit 1 3882 us dollar gehandelt zuvor war der euro noch bis auf 1 3773 dollar gefallen favoriten im dax waren nach den kraeftigen vortagsverlusten die banktitel an der dax spitze verteuerten sich commerzbank um 7 4 prozent deutsche bank gewannen 6 9 prozent sie hatten zuvor am staerksten unter der unsicherheit mit blick auf den eu gipfel gelitten man verteuerten sich um 3 7 prozent laut einem handelsblatt bericht sollen sich mit vw man und ipic die eigner der essener ferrostaal am wochenende in paris zu einem gespraech ueber die loesung des konflikts treffen man will sich restlos von ferrostaal trennen an dem es noch 30 prozent haelt schwaecher als der markt entwickelten sich merck die von dem verzicht auf eine weiterentwicklung des parkinson medikaments safinamid belastet wurden die aktie gewann 0 9 prozent eine etwas 
schlechter als erwartet ausgefallene quartalsbilanz des franzoesischen triebwerkbauers safran belastete den deutschen triebwerksherstellers mtu aero engines dessen papiere gaben um 1 1 prozent nach +Unterhaltung de-DE url-pathpart-aktuelles url-pathpart-stars url-portnumber-80 url-domainname-www.tz-online.de url-domainprefix-www url-domainprefix-www.tz-online url-domaintype-de url-domainsuffix-tz-online.de url-hasarguments- url-firstpartpagename-marion-cotillard-oben-ohne-bad-meer-tz-1446425 url-pagepartsplitname-marion url-pagepartsplitname-cotillard url-pagepartsplitname-oben url-pagepartsplitname-ohne url-pagepartsplitname-bad url-pagepartsplitname-meer url-pagepartsplitname-tz url-pagepartsplitname-1446425 url-lastpartpagename-html url-argname-cmp url-argvalue-defrss url-argvaluesplit-defrss marion cotillard nuernberg so hatte sich monika gruber ihren ersten auftritt bei wetten dass sicher nicht vorgestellt die kabarettistin wurde bei der samstagabend show opfer einer technsichen panne das war passiert +Wirtschaft de-DE url-pathpart-finanzen url-pathpart-geldanlage url-pathpart-article13666921 url-domainname-www.welt.de url-domainprefix-www url-domainprefix-www.welt url-domaintype-de url-domainsuffix-welt.de url-firstpartpagename-Neue-Regeln-zur-Honorarberatung-ziehen-sich-hin url-pagepartsplitname-Neue url-pagepartsplitname-Regeln url-pagepartsplitname-zur url-pagepartsplitname-Honorarberatung url-pagepartsplitname-ziehen url-pagepartsplitname-sich url-pagepartsplitname-hin url-lastpartpagename-html verbraucherschutz fruehestens 2012 sollen neue regeln fuer die honorarberatung kommen derzeit wird das eckpunkte papier von ministerin ilse aigner heiss diskutiert seit rund drei jahren setzt sich verbraucherschutzministerin ilse aigner csu mit ihrer qualitaetsoffensive verbraucherfinanzen fuer bessere informationen fuer bank und versicherungskunden ein im zuge dieser nach der lehman pleite gestarteten initiative sind etwa beratungsprotokolle und 
informationsblaetter eingefuehrt worden dennoch gibt es im anlegerschutz nach wie vor baustellen und man kann sich des eindrucks nicht erwehren dass den kunden nicht immer die besten produkte empfohlen werden sagt gitta connemann cdu in den vom verbraucherministerium im spaetsommer vorgelegten eckpunkten zur honorarberatung sieht sie den fehlenden baustein in einem grossen mosaik connemann ist mitglied des zustaendigen ausschusses fuer verbraucherschutz mit den neuen regeln zur honorarberatung will das verbraucherministerium den interessenskonflikt zwischen der kostenpflichtigen beratung im interesse des kunden und der vermeintlich kostenlosen provisionsgetrieben beratung entschaerfen so soll zum beispiel die steuerliche behandlung von honoraren mit der von provisionen gleichgestellt werden heute koennen beispielsweise bei lebens policen eingerechnete provisionen steuerlich geltend gemacht werden honorare aber nicht zudem sollen sich nur solche fachleute finanzberater nennen duerfen die eine ausreichende qualifikation nachweisen koennen sie muessen darlegen dass sie einen umfassenden marktueberblick haben und somit ueber kredite policen und anlageprodukte gewissenhaft beraten koennen das eckpunkte papier ist laut verbraucherschutzministerium derzeit in den beratungen zum weiteren zeitplan will man nichts sagen ich gehe davon aus dass da in diesem jahr nichts mehr kommt sagt hingegen lars gatschke vom verbraucherzentrale bundesverband vzbv das papier werde derzeit noch heiss diskutiert fuer die branche koennte es grosse veraenderungen mit sich bringen so werden produktanbieter kuenftig gezwungen sein nettotarife anzubieten oder die im produkt enthaltenen abschlusskosten offen zu legen sagt professor hans peter schwintowski der versicherungsexperte lehrt an der berliner humboldt universitaet schwintowski rechnet damit dass die neuen regeln zur honorarberatung den wettbewerbsdruck enorm steigern und kunden von deutlich guenstigeren tarifen profitieren wuerden 
+Wirtschaft de-DE url-pathpart-finanzen url-pathpart-banken url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-sundp-ratingagentur-knoepft-sich-italienische-banken-vor-wachstumsprognose-gekuerzt_aid_667781 url-pagepartsplitname-sundp url-pagepartsplitname-ratingagentur url-pagepartsplitname-knoepft url-pagepartsplitname-sich url-pagepartsplitname-italienische url-pagepartsplitname-banken url-pagepartsplitname-vor url-pagepartsplitname-wachstumsprognose url-pagepartsplitname-gekuerzt url-pagepartsplitname-aid url-pagepartsplitname-667781 url-lastpartpagename-html s p ratingagentur knoepft sich italienische banken vor wachstumsprognose gekuerzt die lage fuer die italienische wirtschaft wird immer verfahrener +Gesundheit de-DE url-pathpart-gesundheit url-pathpart-ratgeber url-pathpart-gehirn url-pathpart-news url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-psoriasis-und-multiple-sklerose-heilungschancen-fuer-autoimmunkrankheiten_aid_676425 url-pagepartsplitname-psoriasis url-pagepartsplitname-und url-pagepartsplitname-multiple url-pagepartsplitname-sklerose url-pagepartsplitname-heilungschancen url-pagepartsplitname-fuer url-pagepartsplitname-autoimmunkrankheiten url-pagepartsplitname-aid url-pagepartsplitname-676425 url-lastpartpagename-html psoriasis und multiple sklerose heilungschancen fuer autoimmunkrankheiten es war wie verhext +Deutschland de-DE url-pathpart-politik url-pathpart-deutschland url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,791359,00 url-pagepartsplitname-0,1518,791359,00 url-lastpartpagename-html#ref=rss slowakisches nein zum efsf das slowakische nein zum rettungsschirm efsf bringt bruessel in die bredouille die zukunft der euro zone liegt nun in den haenden 
der opposition in bratislava die europaeischen sozialisten wollen ihren parteifreunden ein ja fuer die zweite abstimmung abringen die zeit draengt berlin eigentlich haette laengst alles im lot sein sollen mit dem ja zum efsf waere die slowakei das 17 und letzte land der euro zone gewesen das gruenes licht fuer den erweiterten rettungsschirm efsf gegeben haette doch nun steckt sand im getriebe was die schwarz gelbe koalition in berlin vor zwei wochen im groessten und wirtschaftlich staerksten land der euro zone zustande brachte scheiterte in der kleinen slowakei in bratislava zerbrach in der nacht die christlich liberale koalition an der abstimmung ueber den efsf die euro kritische partei freiheit und liberalitaet sas verweigerte die zustimmung ministerpraesidentin iveta radicova muss wohl bald bei neuwahlen antreten selbst die vertrauensfrage hatte der resoluten politikerin nichts genuetzt nun blicken radicova und die anderen 16 regierungs und staatschefs der euro zone auf die opposition sie soll helfen die verfahrene lage doch noch zu loesen die gespraeche zwischen der amtierenden regierung und der sozialdemokratischen smer sollen noch am mittwoch beginnen am donnerstag koennte in bratislava die zustimmung zum efsf dann stehen dabei sollen so der plan drei der vier bisherigen regierungsparteien gemeinsam mit den sozialdemokraten fuer den efsf stimmen der fraktionschef der sozialisten im europaparlament martin schulz ist optimistisch er haelt kontakt zu robert fico dem vorsitzenden der slowakischen sozialdemokraten fico habe ihm zugesagt so schulz dem schirm bei der kommenden abstimmung sein ja zu geben die zeit draengt denn am 23 oktober ist der ohnehin schon einmal verschobene eu gipfel angesetzt auf dem die staats und regierungschefs eine reihe weiterer wichtiger entscheidungen zum euro treffen wollen bei dieser gelegenheit will man auch die zustimmung aller mitglieder der euro gruppe zum efsf vorweisen durchkreuzt bratislava den zeitplan steht die eu vor einer 
tiefen krise abwarten nur keine panik das ist die tonlage der bundeskanzlerin angela merkel gab sich auf ihrer asienreise optimistisch ich bin sehr gewiss dass wir bis zum 23 oktober alle unterschriften aller mitgliedsstaaten unter diesem efsf haben werden sagte die cdu politikerin in ho chi minh stadt ihr vizekanzler philipp roesler sagte +Gesundheit de-DE url-pathpart-~r url-pathpart-rp url-pathpart-online url-pathpart-rss url-pathpart-gesundheit url-pathpart-~3 url-pathpart-ZmonvsoEvHM url-domainname-feeds.rp-online.de url-domainprefix-feeds url-domainprefix-feeds.rp-online url-domaintype-de url-domainsuffix-rp-online.de url-firstpartpagename-story01 url-pagepartsplitname-story01 url-lastpartpagename-htm hormonelle verhuetungsmethoden im ueberblick langfristige verhuetung mit hormonen hormonelle verhuetungsmethoden enthalten das hormon gestagen das den eisprung hemmt und den schleimpfropf der sich am gebaermutterhals befindet verdickt durch diese verdickung ist es fuer die spermien fast unmoeglich in die gebaermutter zu gelangen und eine schwangerschaft auszuloesen fuer frauen die es leid sind taeglich die pille einzunehmen um zu verhueten sind hormonelle methoden deshalb praktisch +Wirtschaft de-DE url-pathpart-wirtschaft url-pathpart-versicherung url-domainname-www.stern.de url-domainprefix-www url-domainprefix-www.stern url-domaintype-de url-domainsuffix-stern.de url-firstpartpagename-gerichtsurteil-zu-provisionen-weg-frei-fuer-discount-versicherungen-1743503 url-pagepartsplitname-gerichtsurteil url-pagepartsplitname-zu url-pagepartsplitname-provisionen url-pagepartsplitname-weg url-pagepartsplitname-frei url-pagepartsplitname-fuer url-pagepartsplitname-discount url-pagepartsplitname-versicherungen url-pagepartsplitname-1743503 url-lastpartpagename-html#utm_source=standard&utm_medium=rssfeed&utm_campaign=wirtschaft gerichtsurteil zu provisionen dieses gerichtsurteil koennte das gaengige geschaeftsmodell fuer lebensversicherungen ins wanken bringen +Wirtschaft 
de-DE url-pathpart-wirtschaft url-pathpart-unternehmen url-domainname-www.spiegel.de url-domainprefix-www url-domainprefix-www.spiegel url-domaintype-de url-domainsuffix-spiegel.de url-firstpartpagename-0,1518,792387,00 url-pagepartsplitname-0,1518,792387,00 url-lastpartpagename-html#ref=rss rohstoffspekulation die verbraucherorganisation foodwatch attackiert die banken +Deutschland de-DE url-pathpart-politik url-pathpart-deutschland url-pathpart-article13661288 url-domainname-www.welt.de url-domainprefix-www url-domainprefix-www.welt url-domaintype-de url-domainsuffix-welt.de url-firstpartpagename-Machtkampf-um-den-braunen-Thron-der-NPD url-pagepartsplitname-Machtkampf url-pagepartsplitname-um url-pagepartsplitname-den url-pagepartsplitname-braunen url-pagepartsplitname-Thron url-pagepartsplitname-der url-pagepartsplitname-NPD url-lastpartpagename-html parteitag auf dem parteitag der npd wird es zu einer kampfkandidatur um den vorsitz kommen doch der richtungsstreit unter den rechtsextremen ist nur gezeter ueber die fassade frueher wollte er gefuerchtet werden als holger apfel sich 2004 erstmals anschickte bundesweit gehoer zu finden bellte er in die mikros der fraktionschef der gerade in den saechsischen landtag eingezogenen npd redete sich in rage seine stimme ueberschlug sich rasch und seine haut fleckte das war einmal heute will holger apfel ganz nach oben dafuer gibt er sich nun dezent zahlreiche parteifreunde aus vielen landesverbaenden haetten ihn zur kandidatur gedraengt liess er vor drei wochen verbreiten der 40 jaehrige will seinen politischen ziehvater udo voigt auf dem anstehenden bundesparteitag vom chefposten vertreiben wo und wann das halten die parteimanager geheim und der geborene hildesheimer macht auf laessig heute schuetzt er seine stimme leise vor den salti die rechte hand in der hosentasche laechelt er viel serioes will er wirken die npd hort der unverbesserlichen der nsdap juenger und hitler verehrer will er an den erfolg rechtsnationaler 
parteien rund um deutschland heranfuehren dafuer muesse sie sich als eine zukunftsgewandte nationale partei mit serioeser radikalitaet im politischen spektrum einordnen hat er formuliert serioese radikalitaet bedeutet konstruktiv gegenwartsbezogen und volksnah zu sein unpolitische nostalgiepflege und ziellosen verbalradikalismus will er von der agenda streichen die zukunft der npd sieht er als anti euro partei als magnet konservativer globalisierungsgegner auf dem bundesparteitag wird es daher eine echte kampfkandidatur geben udo voigt seit 15 jahren unangefochten im amt muss den fraktionschef apfel fuerchten zumal der mit udo pastoers fraktionschef der npd im schweriner landtag eine achse geschmiedet hat voigt seinerseits ist bei den westdeutschen landesverbaenden gut vernetzt der 59 jaehrige wird um seinen posten kaempfen pattex udo nennen sie ihn in der partei schon seit langem was es indes auf dem parteitag nicht geben wird ist ein machtkampf der programme apfels serioese radikalitaet ist der spagat eine neonazi partei fuer die oeffentlichkeit in freundlicheren farben zu zeichnen doch der braunton wird weiter durchschimmern inhaltlich trennt voigt und apfel kein blatt voneinander beide haben die partei fuer soziale themen und die neonazistischen kameradschaften geoeffnet beide sind seit ihrer jugend fuer die npd aktiv fuer beide war es nicht die frage ob sie parteichef werden sondern wann eine moderne rechtspartei wird die npd nie apfels buergerliches auftreten ist wunschdenken und kalkulierte fassade eine moderne rechtspartei wird die npd nie ihr fehlt die inhaltliche flexibilitaet populistischer parteien in den nachbarlaendern ihre dehnbarkeit dafuer hat die npd jede menge ideologie ein besuch in ihrer zentrale in berlin koepenick belegt dies mit jedem schritt duester ist es dort die waende haben seit jahren keine neue farbe gesehen die knarzende holztreppe hinauf laesst man die kapuzenpullitraeger hinter sich die im erdgeschoss verschlissene wahlplakate 
stapeln ab dem ersten stock traegt man hemd und stoffhose mit buegelfalte schliesslich sehen sich die leute von der npd im kampf um sein oder nichtsein des deutschen volkes wir stehen an der spitze dieser erneuerung schreiben sie im programm kein gespraech in der parteizentrale ohne ueberfremdung und verauslaenderung florian stein sitzt am computer in seinem buero gegenueber der kueche das frontfenster ist vergittert der 28 jaehrige ist fuer die mitgliederverwaltung zustaendig er sagt unvermittelt +Wirtschaft de-DE url-pathpart-finanzen url-pathpart-news url-pathpart-konjunktur url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-deutsche-konjunktur-bundesregierung-erwartet-nur-noch-zwergen-wachstum_aid_676479 url-pagepartsplitname-deutsche url-pagepartsplitname-konjunktur url-pagepartsplitname-bundesregierung url-pagepartsplitname-erwartet url-pagepartsplitname-nur url-pagepartsplitname-noch url-pagepartsplitname-zwergen url-pagepartsplitname-wachstum url-pagepartsplitname-aid url-pagepartsplitname-676479 url-lastpartpagename-html deutsche konjunktur bundesregierung erwartet nur noch zwergen wachstum der boom der deutschen wirtschaft geht dem ende zu die regierung rechnet fuer kommendes jahr nur noch mit einem plus von 1 0 prozent bislang hatte sie fast doppelt so viel vorausgesagt doch es gibt auch eine gute nachricht die bundesregierung erwartet angesichts zunehmender weltweiter risiken einen daempfer fuer den aufschwung in deutschland fuer das kommende jahr wird statt bisher 1 8 prozent ein wachstum von 1 0 prozent prognostiziert wie bundeswirtschaftsminister philipp roesler fdp am donnerstag in berlin mitteilte fuer dieses jahr geht die regierung noch von einem staerkeren wachstum von 2 9 prozent aus auch wenn sich das expansionstempo nun wie erwartet verlangsamt habe bleibe deutschland stabilitaetsanker und wachstumsmotor fuer europa erklaerte roesler tragender pfeiler 
der entwicklung in deutschland werde zusehends die binnennachfrage der export schwaeche sich ab roesler sagte +Wirtschaft de-DE url-pathpart-wirtschaft url-pathpart-news url-domainname-www.stern.de url-domainprefix-www url-domainprefix-www.stern url-domaintype-de url-domainsuffix-stern.de url-firstpartpagename-eu-gipfel-zur-euro-krise-mittwoch-banken-solls-was-geben-1742231 url-pagepartsplitname-eu url-pagepartsplitname-gipfel url-pagepartsplitname-zur url-pagepartsplitname-euro url-pagepartsplitname-krise url-pagepartsplitname-mittwoch url-pagepartsplitname-banken url-pagepartsplitname-solls url-pagepartsplitname-was url-pagepartsplitname-geben url-pagepartsplitname-1742231 url-lastpartpagename-html#utm_source=standard&utm_medium=rssfeed&utm_campaign=wirtschaft eu gipfel zur euro krise es war eine demonstration angela merkel und nicolas sarkozy traten gemeinsam vor die journalisten in bruessel dicke luft herrschte dort schon deswegen weil die reporter fast zwei stunden lang in erwartung des deutsch franzoesischen duos zusammengepfercht waren die atemnot entlud sich in verbittertem beifall fuer den praesidenten und die kanzlerin die gemeinsam mit den anderen 25 eu regierenden erst um 16 28 das mittagessen beendeten fuer merkel und sarkozy ging es darum zu zeigen dass man keineswegs so zerstritten sei wie nun schon seit tagen berichtet wird ich werde ihnen nicht erlauben zu schreiben dass die sache gescheitert ist sagte sarkozy den journalisten angesichts eines akuten mangels an entscheidungen und auch merkel bat eher um nachsicht +Gesundheit de-DE url-pathpart-ratgeber url-pathpart-article2038045 url-domainname-www.abendblatt.de url-domainprefix-www url-domainprefix-www.abendblatt url-domaintype-de url-domainsuffix-abendblatt.de url-firstpartpagename-Ab-35-regelmaessig-zum-TUeV-eine-Bestandsaufnahme url-pagepartsplitname-Ab url-pagepartsplitname-35 url-pagepartsplitname-regelmaessig url-pagepartsplitname-zum url-pagepartsplitname-TUeV url-pagepartsplitname-eine 
url-pagepartsplitname-Bestandsaufnahme url-lastpartpagename-html ab 35 regelmaessig zum tuev eine bestandsaufnahme hamburg kerngesunde menschen mitte 30 denken nicht an den arzt sagt stephan hofmeister allgemeinmediziner und hausarzt aus hamburg dabei haben alle gesetzlich versicherten ab dem alter von 35 jahren anspruch zu einer frueherkennungsuntersuchung dem check up 35 und das bevor gesundheitliche beschwerden auftreten alle zwei jahre kann dieser gesundheits tuev wiederholt werden vergleichsweise junge leute kommen aber in der regel nur zu diesem check up wenn sie kinder bekommen oder einen hohen kredit fuer einen hausbau aufnehmen wollen und maenner kommen oft weil ihre partnerin sie schickt berichtet hofmeister aus der praxis zahlen aus niedersachsen zeigen dass nur 26 prozent der berechtigten frauen und 27 prozent der maenner das angebot nutzen in hamburg duerfte die zahl in etwa genauso liegen doch was verbirgt sich hinter diesem check up 35 der zu den sogenannten vorsorge oder frueherkennungsuntersuchungen gehoert die untersuchung ist genau definiert sagt hofmeister stellvertretender vorsitzender der vertreterversammlung der kassenaerztlichen vereinigung hamburg neben einem ausfuehrlichen gespraech ueber vorerkrankungen lebensfuehrung und krankheiten in der familie gehoert eine koerperliche untersuchung mit dem abhorchen von lunge und herz dazu der arzt tastet auch ob der patient vergroesserte lymphknoten eine zu grosse schilddruese oder vergroesserte organe im bauchraum hat auch das gewicht wird ermittelt und der blutdruck gemessen eine blutprobe gehoert ebenfalls dazu zwei werte werden bestimmt +Unterhaltung de-DE url-pathpart-nachrichten url-pathpart-stars url-portnumber-80 url-domainname-www.tz-online.de url-domainprefix-www url-domainprefix-www.tz-online url-domaintype-de url-domainsuffix-tz-online.de url-hasarguments- url-firstpartpagename-lena-geschockt-ihrem-ersten-steuerbescheid-zr-1458344 url-pagepartsplitname-lena 
url-pagepartsplitname-geschockt url-pagepartsplitname-ihrem url-pagepartsplitname-ersten url-pagepartsplitname-steuerbescheid url-pagepartsplitname-zr url-pagepartsplitname-1458344 url-lastpartpagename-html url-argname-cmp url-argvalue-defrss url-argvaluesplit-defrss lena war geschockt von ihrem ersten steuerbescheid lena meyer landrut mag keine behoerdengaenge und beschaeftigt sich nicht gerne mit steuerbescheiden und versicherungen solche pflichtgaenge finde ich unfassbar anstrengend und belastend vor allem weil das sachen sind die konsequenzen haben wenn ich sie nicht erledige sagte die 20 jaehrige gewinnerin des eurovision song contest 2010 dem berliner tagesspiegel als sie ihren ersten steuerbescheid bekommen habe sei sie geschockt gewesen ich habe abzuege von etwa 43 prozent oh doch so viel habe ich gedacht und dann gleich die frist verpasst da musste ich mahngebuehren zahlen +Wirtschaft de-DE url-pathpart-finanzen url-pathpart-news url-pathpart-unternehmen url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-bankbranche-citigroup-trotzt-der-schuldenkrise_aid_675473 url-pagepartsplitname-bankbranche url-pagepartsplitname-citigroup url-pagepartsplitname-trotzt url-pagepartsplitname-der url-pagepartsplitname-schuldenkrise url-pagepartsplitname-aid url-pagepartsplitname-675473 url-lastpartpagename-html bankbranche citigroup trotzt der schuldenkrise der umbau der us bank citigroup nach der beinahe pleite in der finanzkrise macht sich offenbar bezahlt +Deutschland de-DE url-pathpart-politik url-pathpart-deutschland url-pathpart-tid url-pathpart-23842 url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-nach-brandanschlag-in-berlin-polizei-hat-angst-vor-einer-neuen-raf_aid_673445 url-pagepartsplitname-nach url-pagepartsplitname-brandanschlag url-pagepartsplitname-in 
url-pagepartsplitname-berlin url-pagepartsplitname-polizei url-pagepartsplitname-hat url-pagepartsplitname-angst url-pagepartsplitname-vor url-pagepartsplitname-einer url-pagepartsplitname-neuen url-pagepartsplitname-raf url-pagepartsplitname-aid url-pagepartsplitname-673445 url-lastpartpagename-html nach brandanschlag in berlin polizei hat angst vor einer neuen raf zu den anschlaegen auf die bahn bekennt sich eine gruppe von linksextremisten bisher ist die gruppe komplett unbekannt doch beide polizei gewerkschaften nehmen das bekennerschreiben zum anlass fuer drastische warnungen nach den brandanschlaegen in berlin waehlt der chef der polizeigewerkschaft rainer wendt einen drastischen historischen vergleich wir erleben eine renaissance der rote armee fraktion sagte er gegenueber bild de es wird neue strukturen geben dezentral sehr viel lockerer mit einzelnen kleine gruppen der linke terror wird sich noch steigern unterstuetzung erhaelt wendt von seinem kollegen der gewerkschaft der polizei bernhard witthaut er fordert mehr personal fuer verfassungsschutz und polizeilichen staatsschutz die politik muesse die warnungen des verfassungsschutzes ernst nehmen auch der raf terror hat mit der verharmlosenden sogenannten gewalt gegen sachen begonnen spaeter wurden menschen ermordet mit brandsaetzen in berlin und brandenburg hatten mutmasslich linksextreme am montag den zugverkehr rund um die hauptstadt stark beeintraechtigt am berliner hauptbahnhof waere es um ein haar zu einem womoeglich folgenschweren anschlag gekommen ein mitarbeiter entdeckte die sieben brandsaetze in einem tunnel bevor sie explodieren konnten auf freier strecke nordwestlich von berlin zuendete jedoch ein brandsatz in einem schacht fuer signalkabel und kappte so die wichtige verbindung nach hamburg tausende reisende und pendler werden deshalb auch am dienstag von verspaetungen und zugausfaellen betroffen sein in einem bekennerschreiben protestiert eine linksextreme gruppe im internet gegen den 
bundeswehreinsatz in afghanistan das landeskriminalamt lka brandenburg haelt das bekennerschreiben nach einer vorlaeufigen bewertung fuer authentisch bundesverkehrsminister peter ramsauer csu verurteilte die brandanschlaege das leben unbeteiligter pendler familien und kinder zu gefaehrden ist keine meinungsaeusserung sondern ein anschlag auf unsere gesellschaft sagte ramsauer der tageszeitung die welt vom dienstag die verantwortlichen wuerden mit aller haerte verfolgt und zur rechenschaft gezogen verletzt wurde bei den anschlaegen niemand moeglicherweise auch weil die bahn ihr personal nach dem anschlag in brandenburg am fruehen morgen auf zusaetzliche kontrollgaenge schickte an der noerdlichen tunneleinfahrt zum hauptbahnhof entdeckte ein mitarbeiter hinter einem trafo haus am mittag sieben brandsaetze in einem kabelschacht spezialisten machten die behaelter mit brennbaren fluessigkeiten rechtzeitig unschaedlich doch die haetten extrem gefaehrlich sein koennen die sprengsaetze haetten enormen schaden anrichten koennen die tunneleinfahrt am hauptbahnhof liegt gut 200 meter vom bahnhof selbst entfernt der nicht gesperrt wurde +Wirtschaft de-DE url-pathpart-artikel url-domainname-www.neues-deutschland.de url-domainprefix-www url-domainprefix-www.neues-deutschland url-domaintype-de url-domainsuffix-neues-deutschland.de url-firstpartpagename-208813 url-pagepartsplitname-208813 url-lastpartpagename-html freund freund wirtschaftsmacht russlands premier wladimir putin nannte den vorsitzenden der volksrepublik china einen teuren freund und der betonte dass sein gast ein grosser freund des chinesischen volkes sei doch nicht nur um artigkeiten geht es bei dem arbeitsbesuch des ersten mannes russlands in peking zwischen januar und august 2011 war china der wichtigste aussenhandelspartner russlands doch um den usa und europa auf wissenschaftlichem und wirtschaftlichen gebiet ein ernster konkurrent zu sein reicht das und beider reichtum an bodenschaetzen nicht aus man will die 
wirtschaftliche und wissenschaftliche zusammenarbeit auf eine neue ebene die der schluesseltechnologien heben die vorstellungen ueber gemeinsame projekte in der luft und raumfahrt sowie beim schiffbau sind weit gediehen bislang so putin habe man sich beim kauf neuer zivilflugzeuge immer an us amerikanische und westeuropaeische hersteller gehalten aber russland und china seien durchaus in der lage eigene produktionslinien aufzubauen angesprochen wurde der schon laenger anvisierte bau eines gemeinsamen transportflugzeuges die beiderseits geschaetzte russische il 76 ist dabei ein gutes ausgangsmodell putin hatte in den vergangenen jahren bereits eine neuordnung des russischen flugzeugbaus verlangt auch um aus diesem bereich impulse fuer die entwicklung anderer industriebereiche zu erhalten nun versucht er dieses vorhaben bilateral auszubauen und hat einen ueber russland und china hinausgehenden gigantischen absatzmarkt vor augen was moeglich ist zeigt die zusammenarbeit auf militaerischem gebiet der austausch in diesem jahr liegt derzeit geschaetzt bei rund einer milliarde us dollar als schluessel fuer gemeinsamen fortschritt wird auch der energiebereich betrachtet gestuetzt auf gemeinsame erfahrungen beim bau des chinesischen atomkraftwerks tianwan werde man eine engere kooperation bei der nukleartechnik ermoeglichen auf der grundlage modernster technologien und unter beruecksichtigung der eventuellen risiken die praktisch auf den nullpunkt gebracht werden muessen so putin gleichfalls im programm ist der ausbau von stromerzeugung aus wasserkraft und sogenannten alternativen quellen auch auf dem agrarsektor will man kuenftig enger kooperieren um bestehende handels und investitionshindernisse abzubauen verstaendigten sich beide seiten auf eine aktivere nutzung von yuan und rubel bei den bilateralen verrechnungen +Unterhaltung de-DE url-pathpart-nachrichten url-pathpart-stars url-portnumber-80 url-domainname-www.tz-online.de url-domainprefix-www 
url-domainprefix-www.tz-online url-domaintype-de url-domainsuffix-tz-online.de url-hasarguments- url-firstpartpagename-nicolas-cage-sachsen-anhalt-kamera-1452360 url-pagepartsplitname-nicolas url-pagepartsplitname-cage url-pagepartsplitname-sachsen url-pagepartsplitname-anhalt url-pagepartsplitname-kamera url-pagepartsplitname-1452360 url-lastpartpagename-html url-argname-cmp url-argvalue-defrss url-argvaluesplit-defrss nicolas cage in sachsen anhalt vor kamera nuernberg so hatte sich monika gruber ihren ersten auftritt bei wetten dass sicher nicht vorgestellt die kabarettistin wurde bei der samstagabend show opfer einer technsichen panne das war passiert +Gesundheit de-DE url-pathpart-allergie url-pathpart-aktuelles url-domainname-www.stern.de url-domainprefix-www url-domainprefix-www.stern url-domaintype-de url-domainsuffix-stern.de url-firstpartpagename-lebensmittelallergie-forscher-tricksen-immunsystem-aus-1738501 url-pagepartsplitname-lebensmittelallergie url-pagepartsplitname-forscher url-pagepartsplitname-tricksen url-pagepartsplitname-immunsystem url-pagepartsplitname-aus url-pagepartsplitname-1738501 url-lastpartpagename-html#utm_source=standard&utm_medium=rssfeed&utm_campaign=gesundheit lebensmittelallergie bei einer nahrungsmittelallergie loesen auch kleinste bestandteile bestimmter lebensmitteln heftige auch lebensgefaehrliche immunreaktionen aus jetzt konnten amerikanische mediziner in versuchen mit maeusen mithilfe eines neuartigen verfahrens solche allergischen reaktionen auf erdnuesse abschalten sie koppelten erdnussproteine an weisse blutkoerperchen die den tieren anschliessend injiziert wurden nach nur zweimaliger behandlung hatte das immunsystem der maeuse gelernt mit der nahrung aufgenommene erdnuesse zu tolerieren wie die forscher an einem anderen beispiel zeigen konnten eignet sich die technik auch zur therapie anderer allergieformen schreiben sie im fachblatt journal of immunology wir glauben dass wir einen weg gefunden haben um die 
allergischen reaktionen bei nahrungsmittelallergien sicher und schnell abzuschalten sagt paul bryce von der northwestern university feinberg school of medicine in chicago der zusammen mit stephen miller das forscherteam leitete fuer ihre experimente hatten die wissenschaftler maeuse mit einer starken erdnussallergie erzeugt der verzehr der nuesse loeste ueberschiessende immunreaktionen aus die einen oft toedlich verlaufenden anaphylaktischen schock zur folge hatten die forscher entnahmen dem blut der maeuse weisse blutkoerperchen sogenannte leukozyten und hefteten daran erdnussproteine die fuer die allergische reaktion verantwortlich sind die so veraenderten leukozyten wurden den maeusen dann wieder injiziert nach einer erneuten behandlung auf dieselbe weise erhielten die tiere erdnussextrakt mit der nahrung der anaphylaktische schock blieb nun aus das immunsystem tolerierte das erdnussprotein weil das protein zuvor bereits von den weissen blutkoerperchen praesentiert wurde sagt bryce er haelt es fuer moeglich mehr als nur eine art von proteinen an die leukozyten zu koppeln so dass auch therapien gegen mehrfache nahrungsmittelallergien denkbar waeren durch die behandlung so bryce wuerde das gestoerte gleichgewicht zwischen zwei typen von immunzellen den aggressiven th2 t zellen und den daempfenden regulatorischen t zellen wiederhergestellt das verfahren erwies sich auch bei einer anderen form der allergie als wirksam bei der ein protein aus eiern asthmaartige anfaelle ausloest in weiteren experimenten pruefen die forscher nun ob sich die leukozyten durch kuenstliche mikropartikel ersetzen lassen solche partikel als traeger von allergie ausloesenden proteinen liessen sich leichter auf standardisierte weise herstellen +Deutschland de-DE url-pathpart-politik url-pathpart-deutschland url-pathpart-article2059094 url-domainname-www.abendblatt.de url-domainprefix-www url-domainprefix-www.abendblatt url-domaintype-de url-domainsuffix-abendblatt.de 
url-firstpartpagename-Medien-Fahnder-planen-Aktion-gegen-Steuerfluechtlinge url-pagepartsplitname-Medien url-pagepartsplitname-Fahnder url-pagepartsplitname-planen url-pagepartsplitname-Aktion url-pagepartsplitname-gegen url-pagepartsplitname-Steuerfluechtlinge url-lastpartpagename-html medien derzeit sollen die daten von 3000 steuersuendern ueberprueft werden es handelt sich dabei um eine cd die das land nrw 2010 gekauft hatte hamburg bochum deutschen steuerfluechtlingen mit schwarzgeld in luxemburg droht aerger denn steuerfahnder und staatsanwaltschaften aus dem gesamten bundesgebiet bereiten angebliche eine gross angelegte aktion gegen mutmassliche steuersuender vor der spiegel berichtete am donnerstag vorab die ermittler untersuchten daten von einer cd aus luxemburg die das land nrw vor einem jahr fuer vier millionen euro angekauft habe wuppertaler steuerfahnder und die schwerpunktstaatsanwaltschaft fuer wirtschaftskriminalitaet in bochum sollen derzeit daten von etwa 3 000 steuersuendern pruefen in den fokus der ermittler seien kunden einer tochter der britischen grossbank hsbc in luxemburg geraten die bochumer staatsanwaltschaft gab auf nachfrage keine stellungnahme ab die financial times deutschland bezifferte den kaufpreis fuer die cd auf knapp drei millionen euro dafuer enthalte der datentraeger genaue informationen ueber schwarzkonten in luxemburg die eindeutig deutschen hsbc kunden zuzuordnen seien dem bericht zufolge ist fuer november eine razzia geplant die zeitung berichtete unter berufung auf behoerdenkreise der datensatz sei ausserordentlich gross das material erstklassig dadurch dass die ermittlungen im gang seien haetten die steuersuender auch keine chance mehr der strafe durch eine selbstanzeige zu entgehen es wird erwartet dass die cd etliche millionen euro in die staatskasse spuelt dapd +Wirtschaft de-DE url-pathpart-finanzen url-pathpart-news url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de 
url-domainsuffix-focus.de url-firstpartpagename-anti-banken-proteste-polizei-raeumt-zeltlager-von-demonstranten_aid_678189 url-pagepartsplitname-anti url-pagepartsplitname-banken url-pagepartsplitname-proteste url-pagepartsplitname-polizei url-pagepartsplitname-raeumt url-pagepartsplitname-zeltlager url-pagepartsplitname-von url-pagepartsplitname-demonstranten url-pagepartsplitname-aid url-pagepartsplitname-678189 url-lastpartpagename-html anti banken proteste polizei raeumt zeltlager von demonstranten zeugen beklagen ein skandaloeses vorgehen der kalifornischen polizei in der innenstadt von oakland der bewegung occupy wall street zufolge sind bei der raeumung eines zeltlagers 70 menschen festgenommen worden bei den anti banken protesten im kalifornischen oakland sind am dienstag dutzende menschen festgenommen worden nach us medienberichten liess die polizei ein zeltlager in der innenstadt raeumen die demonstranten die mehr als zwei wochen vor dem rathaus campierten hatten ein ultimatum zur raeumung des platzes ignoriert die gruppe occupy oakland sprach in einer twitter mitteilung am dienstag von 70 festnahmen ich zelte nicht ich demonstriere was hier heute passiert ist skandaloes sagte die protestlerin mindy stone nach ihrer festnahme dem san francisco chronicle die stadtverwaltung hatte sich zuvor ueber sanitaere probleme angeblichen drogenmissbrauch und ausschreitungen in dem zeltlager beschwert die bewegung occupy wall street besetzt die wall street hatte im september in new york ihren ausgang genommen und sich auf andere staedte ausgeweitet die menschen wollen die macht der banken brechen verlangen hoehere steuern fuer reiche und verbesserungen im sozialsystem +Unterhaltung de-DE url-pathpart-kultur url-pathpart-kunst url-pathpart-2011 url-pathpart-10 url-domainname-www.zeit.de url-domainprefix-www url-domainprefix-www.zeit url-domaintype-de url-domainsuffix-zeit.de url-firstpartpagename-lehrjahre-eines-dissidenten url-pagepartsplitname-lehrjahre 
url-pagepartsplitname-eines url-pagepartsplitname-dissidenten ai weiwei ausstellung der berliner martin gropius bau zeigt fotografien des chinesischen kuenstlers ai weiwei aus dessen zeit in new york sie dokumentieren die anfaenge einer grossen karriere er ist frei und ist es nicht seit ai weiwei chinas bekanntester kuenstler am 22 juni aus 80 taegiger haft entlassen wurde darf er sich nicht politisch aeussern darf peking nicht verlassen und schon gar nicht ins ausland reisen der fall ai weiwei ist neben dem inhaftierten friedensnobelpreistraeger liu xiaobo das aktuell gravierendste beispiel der drangsalierung von chinesischen dissidenten museen koennen daran kaum etwas aendern aber sie koennen erinnern und mahnen indem sie ais werke zeigen die nach ihrer premiere in peking 2009 nun aus new york in den gropius bau wandernde ausstellung mit ueber 220 fotografien aus ai weiweis new yorker zeit von 1983 bis 1993 hat der kuenstler selbst kuratiert und aus 10 000 fotos eine auswahl getroffen der bei seiner ankunft 25 jaehrige bewohnte eine winzige bleibe im east village er wollte eigentlich studieren trieb sich aber auf der strasse und in der avantgarde szene herum er schrieb gleichsam tagebuch mit der kamera fing schnappschuesse vom aufbruch ein dokumentierte die eigenen kuenstlerischen anfaenge in echtzeit +Deutschland de-DE url-pathpart-politik url-pathpart-article2052122 url-domainname-www.abendblatt.de url-domainprefix-www url-domainprefix-www.abendblatt url-domaintype-de url-domainsuffix-abendblatt.de url-firstpartpagename-Buschkowsky-Halte-Entwicklung-fuer-komfortabler url-pagepartsplitname-Buschkowsky url-pagepartsplitname-Halte url-pagepartsplitname-Entwicklung url-pagepartsplitname-fuer url-pagepartsplitname-komfortabler url-lastpartpagename-html buschkowsky berlin der neukoellner bezirksbuergermeister heinz buschkowsky spd hat die angekuendigten koalitionsgespraeche seiner partei mit der cdu begruesst zur begruendung sagte der spd politiker am freitag im rbb 
sender radioeins mit der union gebe es nicht wie mit den gruenen von vornherein konfliktpunkte wie die verlaengerung der a 100 er mache keinen hehl daraus dass er die entwicklung jetzt fuer komfortabler halte auch mit blick auf die politische mehrheit ueber einen zeitraum von fuenf jahren mit der cdu sieht er in weiteren wesentlichen punkten wie der s bahn problematik und der erweiterung des flughafens berlin brandenburg uebereinstimmungen unterschiedliche sichtweisen koennte es nach buschkowskys einschaetzung in der schulpolitik beim rueckkauf der wasserbetriebe und bei der frage geben wie viele staedtische wohnungen berlin sich zulegen sollte um mietsteigerungen zu verhindern berlins spd und cdu beginnen kommenden mittwoch koalitionsgespraeche spd und cdu in berlin wollen naechsten mittwoch mit ihren verhandlungen zur bildung einer grossen koalition beginnen der cdu fraktions und landesparteivorsitzende frank henkel sagte am donnerstag in der rbb abendschau dies habe er mit seinem spd kollegen michael mueller und dem regierenden buergermeister klaus wowereit spd am abend bei einem treffen im abgeordnetenhaus vereinbart wowereit spd sieht in der grossen koalition kein hindernis fuer ein rot gruenes buendnis auf bundesebene nach der bundestagswahl 2013 die koalitionsverhandlungen zwischen spd und gruenen waren am mittwoch im streit ueber die verlaengerung der autobahn 100 gescheitert die cdu die das projekt mittraegt war aus der abgeordnetenhauswahl mitte september nach der spd als zweitstaerkste kraft vor den gruenen hervorgegangen am mittwochabend bot der spd vorstand der cdu koalitionsverhandlungen an henkel sagte das gespraech mit mueller und wowereit am donnerstag sei sehr angenehm und konstruktiv verlaufen es gibt ein klima der vertrauensvollen zusammenarbeit sagte er unter anderem haben man einen zeitplan fuer die koalitionsverhandlungen und die bildung von arbeitsgruppen vereinbart auf moegliche strittige punkte wollte henkel nicht eingehen er betonte 
lediglich +Wirtschaft de-DE url-pathpart-wirtschaft url-pathpart-news url-domainname-www.stern.de url-domainprefix-www url-domainprefix-www.stern url-domaintype-de url-domainsuffix-stern.de url-firstpartpagename-euro-treffen-in-bruessel-gipfel-erlaesst-griechenland-haelfte-der-schulden-1743959 url-pagepartsplitname-euro url-pagepartsplitname-treffen url-pagepartsplitname-in url-pagepartsplitname-bruessel url-pagepartsplitname-gipfel url-pagepartsplitname-erlaesst url-pagepartsplitname-griechenland url-pagepartsplitname-haelfte url-pagepartsplitname-der url-pagepartsplitname-schulden url-pagepartsplitname-1743959 url-lastpartpagename-html#utm_source=standard&utm_medium=rssfeed&utm_campaign=wirtschaft euro treffen in bruessel auf dem weg zum gipfel +Deutschland de-DE url-pathpart-politik url-pathpart-deutschland url-pathpart-2011 url-pathpart-10 url-domainname-www.zeit.de url-domainprefix-www url-domainprefix-www.zeit url-domaintype-de url-domainsuffix-zeit.de url-firstpartpagename-trojaner-software-diskussion url-pagepartsplitname-trojaner url-pagepartsplitname-software url-pagepartsplitname-diskussion ueberwachungssoftware fuenf faelle aus bayern sind bisher bekannt das vorgehen war immer gleich die ermittler wollten skype gespraeche von verdaechtigen abhoeren sie setzten dafuer aber eine ueberwachungssoftware ein die deutlich mehr kann +Wirtschaft de-DE url-pathpart-finanzen url-pathpart-news url-pathpart-unternehmen url-domainname-www.focus.de url-domainprefix-www url-domainprefix-www.focus url-domaintype-de url-domainsuffix-focus.de url-firstpartpagename-finanzbranche-american-express-profitiert-vom-plastikgeld-boom_aid_676384 url-pagepartsplitname-finanzbranche url-pagepartsplitname-american url-pagepartsplitname-express url-pagepartsplitname-profitiert url-pagepartsplitname-vom url-pagepartsplitname-plastikgeld url-pagepartsplitname-boom url-pagepartsplitname-aid url-pagepartsplitname-676384 url-lastpartpagename-html finanzbranche american express profitiert 
vom plastikgeld boom die steigende beliebtheit der kreditkarte spuelt dem anbieter american express viel geld in die kasse im dritten quartal stieg der gewinn im vergleich zum vorjahr um 13 prozent auf mehr als eine milliarde dollar american express erzielte im dritten geschaeftsquartal einen gewinn von 1 2 milliarden dollar 873 millionen euro die nutzer haben in diesem quartal viel mit der kreditkarte gezahlt sagte firmenchef kenneth chenault am mittwochabend ortszeit in new york er sprach von einem rekordniveau zudem kam dem finanzkonzern zugute dass die nutzer ihre ausstehenden rechnungen zuverlaessiger bezahlen fuer ausfaelle musste american express noch 249 millionen dollar zuruecklegen nach 373 millionen dollar vor jahresfrist saeumige schuldner hatten sich waehrend der wirtschaftskrise zu einem riesenproblem fuer american express entwickelt anders als die wettbewerber visa und mastercard muss das unternehmen platzende rechnungen selbst verkraften bei den zwei rivalen liegt das risiko bei den partnern welche die karten ausgeben und kredit gewaehren das sind zumeist banken aber auch fluggesellschaften oder autoverleiher trotz der geschaeftszahlen die ueber den analystenerwartungen lagen fiel die aktie in den usa nachboerslich leicht unter den anlegern herrscht die sorge dass die zahlungsausfaelle mit fortschreiten der euro schuldenkrise wieder ansteigen koennten und sich die konsumenten mit einkaeufen wieder mehr zurueckhalten zuletzt waren die ertraege die einnahmen von american express um neun prozent auf 7 6 milliarden dollar gestiegen auch visa und mastercard hatten zuletzt ueber gute geschaefte berichtet ihre aktuellen zwischenbilanzen stehen noch aus alle drei anbieter stellen ihr geschaeft derzeit auf eine breitere basis american express hat fuer eine halbe milliarde euro das bonusprogramm payback uebernommen bei payback koennen verbraucher in teilnehmenden geschaeften und onlineshops punkte fuer jeden einkauf sammeln und diese dann in praemien 
eintauschen +Wirtschaft de-DE url-pathpart-print url-pathpart-wams url-pathpart-finanzen url-pathpart-article13675924 url-domainname-www.welt.de url-domainprefix-www url-domainprefix-www.welt url-domaintype-de url-domainsuffix-welt.de url-firstpartpagename-Wie-Frauen-wirklich-ticken url-pagepartsplitname-Wie url-pagepartsplitname-Frauen url-pagepartsplitname-wirklich url-pagepartsplitname-ticken url-lastpartpagename-html wie frauen wirklich ticken auch bei der geldanlage geht es zum betraechtlichen teil um psychologie die journalistin anja kuehner hat an einem finanzseminar fuer frauen teilgenommen und ihre beobachtungen in eine charakteristik weiblicher ratsuchender in sachen geldanlage umgesetzt muessen frauen sich bei der beratung zwischen mehreren produkten entscheiden verlaeuft dieser prozess eher schleifenartig sie sind in der lage neue erfahrungen in ihre ueberlegungen einzubauen und bereits getroffene entscheidungen auch wieder zu revidieren ganz anders die maenner +Deutschland de-DE url-pathpart-politik url-pathpart-deutschland url-pathpart-article2072382 url-domainname-www.abendblatt.de url-domainprefix-www url-domainprefix-www.abendblatt url-domaintype-de url-domainsuffix-abendblatt.de url-firstpartpagename-Klares-Ja-zum-Hebel-Merkel-erreicht-Kanzlermehrheit url-pagepartsplitname-Klares url-pagepartsplitname-Ja url-pagepartsplitname-zum url-pagepartsplitname-Hebel url-pagepartsplitname-Merkel url-pagepartsplitname-erreicht url-pagepartsplitname-Kanzlermehrheit url-lastpartpagename-html klares ja zum hebel merkel erreicht kanzlermehrheit 503 von 596 abgeordneten haben fuer den antrag von union fdp spd und gruenen zur ausweitung des euro rettungsschirms gestimmt berlin der bundestag hat sein einverstaendnis zu einer moeglichen ausweitung des euro rettungsschirms efsf gegeben die mehrheit der abgeordneten stimmte am mittwoch fuer einen gemeinsamen antrag von union fdp spd und gruenen der den hebel mechanismus grundsaetzlich freigibt das parlament gab 
kanzlerin angela merkel damit ein verhandlungsmandat fuer den eu gipfel am abend in bruessel merkel wertete die ausweitung der efsf kapazitaet in ihrer regierungserklaerung als alternativlos die opposition warf schwarz gelb einen chaotischen kurs in der euro politik vor fuer den fraktionsuebergreifenden antrag stimmten 503 von 596 anwesenden abgeordneten 89 parlamentarierer votierten dagegen vier enthielten sich regierungschefin angela merkel cdu hat sogar die kanzlermehrheit erreicht From a07510e82f8faa3cdd7a001d09ce50093aed76bc Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Mon, 18 Jun 2018 16:58:40 -0700 Subject: [PATCH 02/20] switch simple things from subcomponents to factory --- .../EntryPoints/CreateEnsemble.cs | 14 ++++---- .../EntryPoints/DiversityMeasure.cs | 32 +++++++++++++++++++ .../EntryPoints/FeatureSelector.cs | 18 +++++++++++ .../EntryPoints/OutputCombiner.cs | 27 ++++++++++++++++ .../EntryPoints/SubModelSelector.cs | 25 +++++++++++++++ .../OutputCombiners/Average.cs | 5 +-- .../OutputCombiners/BaseMultiAverager.cs | 2 +- .../OutputCombiners/BaseMultiCombiner.cs | 4 +-- .../OutputCombiners/MultiAverage.cs | 11 +++++-- .../OutputCombiners/MultiMedian.cs | 9 ++++++ .../OutputCombiners/MultiStacking.cs | 5 ++- .../OutputCombiners/MultiVoting.cs | 12 +++++-- .../OutputCombiners/MultiWeightedAverage.cs | 6 +++- .../OutputCombiners/RegressionStacking.cs | 11 +++++-- .../OutputCombiners/Stacking.cs | 8 +++-- .../OutputCombiners/Voting.cs | 3 +- .../OutputCombiners/WeightedAverage.cs | 6 +++- .../FeatureSelector/AllFeatureSelector.cs | 1 + .../FeatureSelector/RandomFeatureSelector.cs | 6 +++- .../Selector/IDiversityMeasure.cs | 5 +++ .../Selector/IFeatureSelector.cs | 4 +++ .../Selector/ISubModelSelector.cs | 11 +++++++ .../Selector/ISubsetSelector.cs | 5 +++ .../SubModelSelector/BaseDiverseSelector.cs | 14 ++++---- .../BestDiverseSelectorBinary.cs | 9 ++++++ .../BestDiverseSelectorMultiClass.cs | 7 ++++ .../BestDiverseSelectorRegression.cs | 7 ++++ 
.../BestPerformanceRegressionSelector.cs | 6 +++- .../BestPerformanceSelector.cs | 6 +++- .../BestPerformanceSelectorMultiClass.cs | 6 +++- .../SubModelSelector/SubModelDataSelector.cs | 11 +++---- .../SubsetSelector/AllInstanceSelector.cs | 5 ++- .../SubsetSelector/BootstrapSelector.cs | 7 ++-- .../SubsetSelector/RandomPartitionSelector.cs | 5 ++- .../Trainer/Binary/EnsembleTrainer.cs | 7 ++-- .../Trainer/EnsembleTrainerBase.cs | 17 +++++----- .../MulticlassDataPartitionEnsembleTrainer.cs | 9 +++--- .../Regression/RegressionEnsembleTrainer.cs | 7 ++-- 38 files changed, 287 insertions(+), 66 deletions(-) create mode 100644 src/Microsoft.ML.Ensemble/EntryPoints/DiversityMeasure.cs create mode 100644 src/Microsoft.ML.Ensemble/EntryPoints/FeatureSelector.cs create mode 100644 src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs create mode 100644 src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs b/src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs index ee7cb8fcac..96750a5406 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs +++ b/src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs @@ -7,6 +7,7 @@ using System.IO; using System.IO.Compression; using System.Linq; +using Microsoft.ML.Ensemble.EntryPoints; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.CommandLine; using Microsoft.ML.Runtime.Data; @@ -14,7 +15,6 @@ using Microsoft.ML.Runtime.Ensemble.OutputCombiners; using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.Internal.Utilities; -using Microsoft.ML.Runtime.Learners; [assembly: LoadableClass(typeof(void), typeof(EnsembleCreator), null, typeof(SignatureEntryPointModule), "CreateEnsemble")] @@ -155,13 +155,13 @@ public static CommonOutputs.BinaryClassificationOutput CreateBinaryEnsemble(IHos switch (input.ModelCombiner) { case ClassifierCombiner.Median: - args.OutputCombiner = new SubComponent("Median"); + args.OutputCombiner = new 
MedianFactory(); break; case ClassifierCombiner.Average: - args.OutputCombiner = new SubComponent("Average"); + args.OutputCombiner = new AverageFactory(); break; case ClassifierCombiner.Vote: - args.OutputCombiner = new SubComponent("Voting"); + args.OutputCombiner = new VotingFactory(); break; default: throw host.Except("Unknown combiner kind"); @@ -191,10 +191,10 @@ public static CommonOutputs.RegressionOutput CreateRegressionEnsemble(IHostEnvir switch (input.ModelCombiner) { case ScoreCombiner.Median: - args.OutputCombiner = new SubComponent("Median"); + args.OutputCombiner = new MedianFactory(); break; case ScoreCombiner.Average: - args.OutputCombiner = new SubComponent("Average"); + args.OutputCombiner = new AverageFactory(); break; default: throw host.Except("Unknown combiner kind"); @@ -279,7 +279,7 @@ public static CommonOutputs.MulticlassClassificationOutput CreateMultiClassPipel combiner = new MultiAverage(host, new MultiAverage.Arguments() { Normalize = true }); break; case ClassifierCombiner.Vote: - combiner = new MultiVoting(host); + combiner = new MultiVoting(host, new MultiVoting.Arguments()); break; default: throw host.Except("Unknown combiner kind"); diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/DiversityMeasure.cs b/src/Microsoft.ML.Ensemble/EntryPoints/DiversityMeasure.cs new file mode 100644 index 0000000000..9d605b9fb7 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/EntryPoints/DiversityMeasure.cs @@ -0,0 +1,32 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble.Selector; +using Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure; +using Microsoft.ML.Runtime.EntryPoints; + +namespace Microsoft.ML.Ensemble.EntryPoints +{ + [TlcModule.Component(Name = DisagreementDiversityMeasure.LoadName, FriendlyName = DisagreementDiversityMeasure.UserName)] + public sealed class DisagreementDiversityFactory : ISupportDiversityMeasureFactory + { + public IDiversityMeasure CreateComponent(IHostEnvironment env) => new DisagreementDiversityMeasure(); + } + + [TlcModule.Component(Name = RegressionDisagreementDiversityMeasure.LoadName, FriendlyName = DisagreementDiversityMeasure.UserName)] + public sealed class RegressionDisagreementDiversityFactory : ISupportDiversityMeasureFactory + { + public IDiversityMeasure CreateComponent(IHostEnvironment env) => new RegressionDisagreementDiversityMeasure(); + } + + [TlcModule.Component(Name = MultiDisagreementDiversityMeasure.LoadName, FriendlyName = DisagreementDiversityMeasure.UserName)] + public sealed class MultiDisagreementDiversityFactory : ISupportDiversityMeasureFactory> + { + public IDiversityMeasure> CreateComponent(IHostEnvironment env) => new MultiDisagreementDiversityMeasure(); + } + +} diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/FeatureSelector.cs b/src/Microsoft.ML.Ensemble/EntryPoints/FeatureSelector.cs new file mode 100644 index 0000000000..bc70fdb649 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/EntryPoints/FeatureSelector.cs @@ -0,0 +1,18 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Ensemble.Selector; +using Microsoft.ML.Runtime.Ensemble.Selector.FeatureSelector; +using Microsoft.ML.Runtime.EntryPoints; + +namespace Microsoft.ML.Ensemble.EntryPoints +{ + [TlcModule.Component(Name = AllFeatureSelector.LoadName, FriendlyName = AllFeatureSelector.UserName)] + public sealed class AllFeatureSelectorFactory : ISupportFeatureSelectorFactory + { + IFeatureSelector IComponentFactory.CreateComponent(IHostEnvironment env) => new AllFeatureSelector(env); + } + +} diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs b/src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs new file mode 100644 index 0000000000..0f2ede3d66 --- /dev/null +++ b/src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs @@ -0,0 +1,27 @@ +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.EntryPoints; +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.Ensemble.EntryPoints +{ + [TlcModule.Component(Name = Average.LoadName, FriendlyName = Average.UserName)] + public sealed class AverageFactory : ISupportOutputCombinerFactory + { + IOutputCombiner IComponentFactory>.CreateComponent(IHostEnvironment env) => new Average(env); + } + + [TlcModule.Component(Name = Median.LoadName, FriendlyName = Median.UserName)] + public sealed class MedianFactory : ISupportOutputCombinerFactory + { + IOutputCombiner IComponentFactory>.CreateComponent(IHostEnvironment env) => new Median(env); + } + + [TlcModule.Component(Name = Voting.LoadName, FriendlyName = Voting.UserName)] + public sealed class VotingFactory : ISupportOutputCombinerFactory + { + IOutputCombiner IComponentFactory>.CreateComponent(IHostEnvironment env) => new Voting(env); + } +} diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs b/src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs new file mode 100644 index 0000000000..57e397c789 --- 
/dev/null +++ b/src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs @@ -0,0 +1,25 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.Ensemble.Selector; +using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; +using Microsoft.ML.Runtime.EntryPoints; + +namespace Microsoft.ML.Ensemble.EntryPoints +{ + [TlcModule.Component(Name = AllSelector.LoadName, FriendlyName = AllSelector.UserName)] + public sealed class AllSelectorFactory : ISupportSubModelSelectorFactory + { + public ISubModelSelector CreateComponent(IHostEnvironment env) => new AllSelector(env); + } + + [TlcModule.Component(Name = AllSelectorMultiClass.LoadName, FriendlyName = AllSelectorMultiClass.UserName)] + public sealed class AllSelectorMultiClassFactory : ISupportSubModelSelectorFactory> + { + public ISubModelSelector> CreateComponent(IHostEnvironment env) => new AllSelectorMultiClass(env); + } +} diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/Average.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/Average.cs index 0992029cad..a4d732fe2d 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/Average.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/Average.cs @@ -7,13 +7,14 @@ using Microsoft.ML.Runtime.Ensemble.OutputCombiners; using Microsoft.ML.Runtime.Model; -[assembly: LoadableClass(typeof(Average), null, typeof(SignatureCombiner), Average.LoadName)] -[assembly: LoadableClass(typeof(Average), null, typeof(SignatureLoadModel), Average.LoadName, Average.LoaderSignature)] +[assembly: LoadableClass(typeof(Average), null, typeof(SignatureCombiner), Average.UserName)] +[assembly: LoadableClass(typeof(Average), null, typeof(SignatureLoadModel), Average.UserName, Average.LoaderSignature)] namespace 
Microsoft.ML.Runtime.Ensemble.OutputCombiners { public sealed class Average : BaseAverager, ICanSaveModel, IRegressionOutputCombiner { + public const string UserName = "Average"; public const string LoadName = "Average"; public const string LoaderSignature = "AverageCombiner"; diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs index d8e37ebcb4..25377cb30f 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs @@ -12,7 +12,7 @@ namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners { public abstract class BaseMultiAverager : BaseMultiCombiner { - internal BaseMultiAverager(IHostEnvironment env, string name, Arguments args) + internal BaseMultiAverager(IHostEnvironment env, string name, ArgumentsBase args) : base(env, name, args) { } diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs index 9a345b3ebe..8b21458280 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs @@ -15,7 +15,7 @@ public abstract class BaseMultiCombiner : IOutputCombiner> { protected readonly IHost Host; - public class Arguments + public abstract class ArgumentsBase { [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to normalize the output of base models before combining them", ShortName = "norm", SortOrder = 50)] @@ -24,7 +24,7 @@ public class Arguments protected readonly bool Normalize; - internal BaseMultiCombiner(IHostEnvironment env, string name, Arguments args) + internal BaseMultiCombiner(IHostEnvironment env, string name, ArgumentsBase args) { Contracts.AssertValue(env); env.AssertNonWhiteSpace(name); diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs 
index 73c10e83fd..f1cde7111f 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs @@ -6,12 +6,13 @@ using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.Model; [assembly: LoadableClass(typeof(MultiAverage), typeof(MultiAverage.Arguments), typeof(SignatureCombiner), - Average.LoadName, MultiAverage.LoadName)] + Average.UserName, MultiAverage.LoadName)] [assembly: LoadableClass(typeof(MultiAverage), null, typeof(SignatureLoadModel), - Average.LoadName, MultiAverage.LoadName, MultiAverage.LoaderSignature)] + Average.UserName, MultiAverage.LoadName, MultiAverage.LoaderSignature)] namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners { @@ -30,6 +31,12 @@ private static VersionInfo GetVersionInfo() loaderSignature: LoaderSignature); } + [TlcModule.Component(Name = LoadName, FriendlyName = Average.UserName)] + public sealed class Arguments : ArgumentsBase, ISupportOutputCombinerFactory> + { + public IOutputCombiner> CreateComponent(IHostEnvironment env) => new MultiAverage(env, this); + } + public MultiAverage(IHostEnvironment env, Arguments args) : base(env, LoaderSignature, args) { diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs index 0e94315d21..3360ea0378 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs @@ -6,6 +6,7 @@ using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Model; @@ -33,6 +34,12 @@ private static VersionInfo GetVersionInfo() loaderSignature: LoaderSignature); } + [TlcModule.Component(Name = LoadName, 
FriendlyName = Median.UserName)] + public sealed class Arguments : ArgumentsBase, ISupportOutputCombinerFactory> + { + public IOutputCombiner> CreateComponent(IHostEnvironment env) => new MultiMedian(env, this); + } + public MultiMedian(IHostEnvironment env, Arguments args) : base(env, LoaderSignature, args) { @@ -91,5 +98,7 @@ public override Combiner> GetCombiner() dst = new VBuffer(len, values, dst.Indices); }; } + + } } diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs index 845eab0500..7aa2f2c059 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs @@ -7,6 +7,7 @@ using Microsoft.ML.Runtime.CommandLine; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Model; @@ -35,8 +36,10 @@ private static VersionInfo GetVersionInfo() loaderSignature: LoaderSignature); } - public class Arguments : ArgumentsBase + [TlcModule.Component(Name = LoadName, FriendlyName = Stacking.UserName)] + public sealed class Arguments : ArgumentsBase, ISupportOutputCombinerFactory> { + public IOutputCombiner> CreateComponent(IHostEnvironment env) => new MultiStacking(env, this); public Arguments() { // REVIEW tfinley: Kinda stupid. Perhaps we can have a better non-parametric learner. 
diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs index 802a756f5c..3d6fcc2130 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs @@ -6,6 +6,7 @@ using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Model; using Microsoft.ML.Runtime.Numeric; @@ -32,8 +33,15 @@ private static VersionInfo GetVersionInfo() loaderSignature: LoaderSignature); } - public MultiVoting(IHostEnvironment env) - : base(env, LoaderSignature, new Arguments() { Normalize = false }) + [TlcModule.Component(Name = LoadName, FriendlyName = Voting.UserName)] + public sealed class Arguments : ArgumentsBase, ISupportOutputCombinerFactory> + { + public new bool Normalize = false; + public IOutputCombiner> CreateComponent(IHostEnvironment env) => new MultiVoting(env, this); + } + + public MultiVoting(IHostEnvironment env, Arguments args) + : base(env, LoaderSignature, args) { Host.Assert(!Normalize); } diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiWeightedAverage.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiWeightedAverage.cs index 8d2439ec7a..a24099577a 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiWeightedAverage.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiWeightedAverage.cs @@ -7,6 +7,7 @@ using Microsoft.ML.Runtime.CommandLine; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.Internal.Internallearn; using Microsoft.ML.Runtime.Model; @@ -46,11 +47,14 @@ private static VersionInfo GetVersionInfo() loaderSignature: LoaderSignature); } - public new class Arguments : BaseMultiCombiner.Arguments + [TlcModule.Component(Name = 
LoadName, FriendlyName = UserName)] + public sealed class Arguments : ArgumentsBase, ISupportOutputCombinerFactory> { [Argument(ArgumentType.AtMostOnce, HelpText = "The metric type to be used to find the weights for each model", ShortName = "wn", SortOrder = 50)] [TGUI(Label = "Metric Name", Description = "The weights are calculated according to the selected metric")] public MultiWeightageKind WeightageName = MultiWeightageKind.AccuracyMicroAvg; + + public IOutputCombiner> CreateComponent(IHostEnvironment env) => new MultiWeightedAverage(env, this); } private readonly MultiWeightageKind _weightageKind; diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs index 31c43970bc..812b8af2eb 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs @@ -6,18 +6,20 @@ using Microsoft.ML.Runtime.CommandLine; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.Model; [assembly: LoadableClass(typeof(RegressionStacking), typeof(RegressionStacking.Arguments), typeof(SignatureCombiner), - Stacking.UserName, RegressionStacking.LoaderSignature)] + Stacking.UserName, RegressionStacking.LoadName)] [assembly: LoadableClass(typeof(RegressionStacking), null, typeof(SignatureLoadModel), - Stacking.UserName, RegressionStacking.LoaderSignature)] + Stacking.UserName, RegressionStacking.LoadName)] namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners { using TScalarPredictor = IPredictorProducing; public sealed class RegressionStacking : BaseScalarStacking, IRegressionOutputCombiner, ICanSaveModel { + public const string LoadName = "RegressionStacking"; public const string LoaderSignature = "RegressionStacking"; private static VersionInfo GetVersionInfo() @@ -30,12 +32,15 @@ private static VersionInfo 
GetVersionInfo() loaderSignature: LoaderSignature); } - public class Arguments : ArgumentsBase + [TlcModule.Component(Name = LoadName, FriendlyName = Stacking.UserName)] + public sealed class Arguments : ArgumentsBase, ISupportOutputCombinerFactory { public Arguments() { BasePredictorType = new SubComponent, SignatureRegressorTrainer>("FastTreeRegression"); } + + public IOutputCombiner CreateComponent(IHostEnvironment env) => new RegressionStacking(env, this); } public RegressionStacking(IHostEnvironment env, Arguments args) diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/Stacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/Stacking.cs index af582e98e9..cf7a731177 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/Stacking.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/Stacking.cs @@ -7,9 +7,10 @@ using Microsoft.ML.Runtime.CommandLine; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.Model; -[assembly: LoadableClass(typeof(Stacking), typeof(Stacking.Arguments), typeof(SignatureCombiner),Stacking.UserName, Stacking.LoadName)] +[assembly: LoadableClass(typeof(Stacking), typeof(Stacking.Arguments), typeof(SignatureCombiner), Stacking.UserName, Stacking.LoadName)] [assembly: LoadableClass(typeof(Stacking), null, typeof(SignatureLoadModel), Stacking.UserName, Stacking.LoaderSignature)] namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners @@ -31,12 +32,15 @@ private static VersionInfo GetVersionInfo() loaderSignature: LoaderSignature); } - public class Arguments : ArgumentsBase + [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] + public sealed class Arguments : ArgumentsBase, ISupportOutputCombinerFactory { public Arguments() { BasePredictorType = new SubComponent, SignatureBinaryClassifierTrainer>("FastTreeBinaryClassification"); } + + public IOutputCombiner CreateComponent(IHostEnvironment env) => new Stacking(env, this); } 
public Stacking(IHostEnvironment env, Arguments args) diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/Voting.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/Voting.cs index 2b90e6a638..932f99d93a 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/Voting.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/Voting.cs @@ -8,7 +8,7 @@ using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Model; -[assembly: LoadableClass(typeof(Voting), null, typeof(SignatureCombiner), Voting.UserName, Voting.UserName)] +[assembly: LoadableClass(typeof(Voting), null, typeof(SignatureCombiner), Voting.UserName, Voting.LoadName)] [assembly: LoadableClass(typeof(Voting), null, typeof(SignatureLoadModel), Voting.UserName, Voting.LoaderSignature)] namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners @@ -17,6 +17,7 @@ public sealed class Voting : IBinaryOutputCombiner, ICanSaveModel { private readonly IHost _host; public const string UserName = "Voting"; + public const string LoadName = "Voting"; public const string LoaderSignature = "VotingCombiner"; private static VersionInfo GetVersionInfo() diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/WeightedAverage.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/WeightedAverage.cs index caa582473b..a6449febcc 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/WeightedAverage.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/WeightedAverage.cs @@ -7,6 +7,7 @@ using Microsoft.ML.Runtime.CommandLine; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.Internal.Internallearn; using Microsoft.ML.Runtime.Model; @@ -34,11 +35,14 @@ private static VersionInfo GetVersionInfo() loaderSignature: LoaderSignature); } - public class Arguments + [TlcModule.Component(Name = LoadName, FriendlyName = Stacking.UserName)] + public sealed class Arguments: ISupportOutputCombinerFactory { [Argument(ArgumentType.AtMostOnce, HelpText 
= "The metric type to be used to find the weights for each model", ShortName = "wn", SortOrder = 50)] [TGUI(Label = "Weightage Name", Description = "The weights are calculated according to the selected metric")] public WeightageKind WeightageName = WeightageKind.Auc; + + public IOutputCombiner CreateComponent(IHostEnvironment env) => new WeightedAverage(env, this); } private WeightageKind _weightageKind; diff --git a/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/AllFeatureSelector.cs b/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/AllFeatureSelector.cs index be532901a6..84a70a6ee4 100644 --- a/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/AllFeatureSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/AllFeatureSelector.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. + using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.Selector; diff --git a/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/RandomFeatureSelector.cs b/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/RandomFeatureSelector.cs index d239d8a23e..6e709388a7 100644 --- a/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/RandomFeatureSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/RandomFeatureSelector.cs @@ -9,6 +9,7 @@ using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.FeatureSelector; +using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.Training; [assembly: LoadableClass(typeof(RandomFeatureSelector), typeof(RandomFeatureSelector.Arguments), @@ -21,10 +22,13 @@ public class RandomFeatureSelector : IFeatureSelector public const string UserName = "Random Feature Selector"; public const string LoadName = "RandomFeatureSelector"; - public class 
Arguments + [TlcModule.Component(Name = RandomFeatureSelector.LoadName, FriendlyName = RandomFeatureSelector.UserName)] + public sealed class Arguments: ISupportFeatureSelectorFactory { [Argument(ArgumentType.AtMostOnce, HelpText = "The proportion of features to be selected. The range is 0.0-1.0", ShortName = "fp", SortOrder = 50)] public Single FeaturesSelectionProportion = 0.8f; + + public IFeatureSelector CreateComponent(IHostEnvironment env) => new RandomFeatureSelector(env, this); } private readonly Arguments _args; diff --git a/src/Microsoft.ML.Ensemble/Selector/IDiversityMeasure.cs b/src/Microsoft.ML.Ensemble/Selector/IDiversityMeasure.cs index af7ee1600a..0319a325c9 100644 --- a/src/Microsoft.ML.Ensemble/Selector/IDiversityMeasure.cs +++ b/src/Microsoft.ML.Ensemble/Selector/IDiversityMeasure.cs @@ -5,6 +5,7 @@ using System.Collections.Concurrent; using System.Collections.Generic; using Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure; +using Microsoft.ML.Runtime.EntryPoints; namespace Microsoft.ML.Runtime.Ensemble.Selector { @@ -15,4 +16,8 @@ List> CalculateDiversityMeasure(IList : IComponentFactory> + { + } } diff --git a/src/Microsoft.ML.Ensemble/Selector/IFeatureSelector.cs b/src/Microsoft.ML.Ensemble/Selector/IFeatureSelector.cs index 1a046fe0f4..a854b50ba9 100644 --- a/src/Microsoft.ML.Ensemble/Selector/IFeatureSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/IFeatureSelector.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. 
using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.EntryPoints; namespace Microsoft.ML.Runtime.Ensemble.Selector { @@ -12,4 +13,7 @@ public interface IFeatureSelector } public delegate void SignatureEnsembleFeatureSelector(); + public interface ISupportFeatureSelectorFactory : IComponentFactory + { + } } diff --git a/src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs b/src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs index f80a2d4290..2e99a5ffdf 100644 --- a/src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs @@ -2,6 +2,8 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.EntryPoints; using System; using System.Collections.Generic; @@ -25,5 +27,14 @@ public interface IBinarySubModelSelector : ISubModelSelector { } + public interface IMulticlassSubModelSelector : ISubModelSelector> + { + } + public delegate void SignatureEnsembleSubModelSelector(); + + public interface ISupportSubModelSelectorFactory : IComponentFactory> + { + } + } diff --git a/src/Microsoft.ML.Ensemble/Selector/ISubsetSelector.cs b/src/Microsoft.ML.Ensemble/Selector/ISubsetSelector.cs index 90209f5391..71d71d23f9 100644 --- a/src/Microsoft.ML.Ensemble/Selector/ISubsetSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/ISubsetSelector.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Runtime.EntryPoints; namespace Microsoft.ML.Runtime.Ensemble.Selector { @@ -17,4 +18,8 @@ public interface ISubsetSelector } public delegate void SignatureEnsembleDataSelector(); + + public interface ISupportSubsetSelectorFactory : IComponentFactory + { + } } diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseDiverseSelector.cs 
b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseDiverseSelector.cs index f11d687314..790b262806 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseDiverseSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseDiverseSelector.cs @@ -17,33 +17,33 @@ namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector public abstract class BaseDiverseSelector : SubModelDataSelector where TDiversityMetric : class, IDiversityMeasure { - public sealed class Arguments : ArgumentsBase + public abstract class DiverseSelectorArguments : ArgumentsBase { [Argument(ArgumentType.Multiple, HelpText = "The metric type to be used to find the diversity among base learners", ShortName = "dm", SortOrder = 50)] [TGUI(Label = "Diversity Measure Type")] - public SubComponent DiversityMetricType; + public ISupportDiversityMeasureFactory DiversityMetricType; } - private readonly SubComponent _diversityMetricType; + private readonly ISupportDiversityMeasureFactory _diversityMetricType; private ConcurrentDictionary>, TOutput[]> _predictions; public abstract string DiversityMeasureLoadname { get; } - protected internal BaseDiverseSelector(IHostEnvironment env, Arguments args, string name) + protected internal BaseDiverseSelector(IHostEnvironment env, DiverseSelectorArguments args, string name) : base(args, env, name) { _diversityMetricType = args.DiversityMetricType; _predictions = new ConcurrentDictionary>, TOutput[]>(); } - protected TDiversityMetric CreateDiversityMetric() + protected IDiversityMeasure CreateDiversityMetric() { - if (!_diversityMetricType.IsGood()) + if (_diversityMetricType == null) { var sc = new SubComponent(DiversityMeasureLoadname); return sc.CreateInstance(Host); } - return _diversityMetricType.CreateInstance(Host); + return _diversityMetricType.CreateComponent(Host); } public override void CalculateMetrics(FeatureSubsetModel> model, diff --git 
a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs index 917a957190..61b18855ce 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs @@ -9,6 +9,7 @@ using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure; using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; +using Microsoft.ML.Runtime.EntryPoints; [assembly: LoadableClass(typeof(BestDiverseSelectorBinary), typeof(BestDiverseSelectorBinary.Arguments), typeof(SignatureEnsembleSubModelSelector), BestDiverseSelectorBinary.UserName, BestDiverseSelectorBinary.LoadName)] @@ -26,6 +27,12 @@ public override string DiversityMeasureLoadname get { return DisagreementDiversityMeasure.LoadName; } } + [TlcModule.Component(Name = BestDiverseSelectorBinary.LoadName, FriendlyName = BestDiverseSelectorBinary.UserName)] + public sealed class Arguments : DiverseSelectorArguments, ISupportSubModelSelectorFactory + { + public ISubModelSelector CreateComponent(IHostEnvironment env) => new BestDiverseSelectorBinary(env, this); + } + public BestDiverseSelectorBinary(IHostEnvironment env, Arguments args) : base(env, args, LoadName) { @@ -42,5 +49,7 @@ protected override PredictionKind PredictionKind { get { return PredictionKind.BinaryClassification; } } + + } } diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs index d95b10ff51..dee3c27511 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs @@ -10,6 +10,7 @@ using Microsoft.ML.Runtime.Ensemble.Selector; using 
Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure; using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; +using Microsoft.ML.Runtime.EntryPoints; [assembly: LoadableClass(typeof(BestDiverseSelectorMultiClass), typeof(BestDiverseSelectorMultiClass.Arguments), typeof(SignatureEnsembleSubModelSelector), BestDiverseSelectorMultiClass.UserName, BestDiverseSelectorMultiClass.LoadName)] @@ -27,6 +28,12 @@ public override string DiversityMeasureLoadname get { return MultiDisagreementDiversityMeasure.LoadName; } } + [TlcModule.Component(Name = BestDiverseSelectorMultiClass.LoadName, FriendlyName = BestDiverseSelectorMultiClass.UserName)] + public sealed class Arguments : DiverseSelectorArguments, ISupportSubModelSelectorFactory> + { + public ISubModelSelector> CreateComponent(IHostEnvironment env) => new BestDiverseSelectorMultiClass(env, this); + } + public BestDiverseSelectorMultiClass(IHostEnvironment env, Arguments args) : base(env, args, LoadName) { diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs index 02466599fe..1bd8a0b9ca 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs @@ -9,6 +9,7 @@ using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure; using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; +using Microsoft.ML.Runtime.EntryPoints; [assembly: LoadableClass(typeof(BestDiverseSelectorRegression), typeof(BestDiverseSelectorRegression.Arguments), typeof(SignatureEnsembleSubModelSelector), BestDiverseSelectorRegression.UserName, BestDiverseSelectorRegression.LoadName)] @@ -26,6 +27,12 @@ public override string DiversityMeasureLoadname get { return RegressionDisagreementDiversityMeasure.LoadName; } } + 
[TlcModule.Component(Name = LoadName, FriendlyName = UserName)] + public sealed class Arguments : DiverseSelectorArguments, ISupportSubModelSelectorFactory + { + public ISubModelSelector CreateComponent(IHostEnvironment env) => new BestDiverseSelectorRegression(env, this); + } + public BestDiverseSelectorRegression(IHostEnvironment env, Arguments args) : base(env, args, LoadName) { diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs index 5ed9afacb9..273c8a682a 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs @@ -8,6 +8,7 @@ using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; +using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.Internal.Internallearn; [assembly: LoadableClass(typeof(BestPerformanceRegressionSelector), typeof(BestPerformanceRegressionSelector.Arguments), @@ -17,11 +18,14 @@ namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector { public sealed class BestPerformanceRegressionSelector : BaseBestPerformanceSelector, IRegressionSubModelSelector { - public sealed class Arguments : ArgumentsBase + [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] + public sealed class Arguments : ArgumentsBase, ISupportSubModelSelectorFactory { [Argument(ArgumentType.AtMostOnce, HelpText = "The metric type to be used to find the best performance", ShortName = "mn", SortOrder = 50)] [TGUI(Label = "Metric Name")] public RegressionEvaluator.Metrics MetricName = RegressionEvaluator.Metrics.L1; + + public ISubModelSelector CreateComponent(IHostEnvironment env) => new BestPerformanceRegressionSelector(env, this); } public const string UserName = "Best Performance 
Selector"; public const string LoadName = "BestPerformanceRegressionSelector"; diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs index c8047cabfb..fae99c4b00 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs @@ -8,6 +8,7 @@ using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; +using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.Internal.Internallearn; [assembly: LoadableClass(typeof(BestPerformanceSelector), typeof(BestPerformanceSelector.Arguments), @@ -17,11 +18,14 @@ namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector { public sealed class BestPerformanceSelector : BaseBestPerformanceSelector, IBinarySubModelSelector { - public sealed class Arguments : ArgumentsBase + [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] + public sealed class Arguments : ArgumentsBase, ISupportSubModelSelectorFactory { [Argument(ArgumentType.AtMostOnce, HelpText = "The metric type to be used to find the best performance", ShortName = "mn", SortOrder = 50)] [TGUI(Label = "Metric Name")] public BinaryClassifierEvaluator.Metrics MetricName = BinaryClassifierEvaluator.Metrics.Auc; + + public ISubModelSelector CreateComponent(IHostEnvironment env) => new BestPerformanceSelector(env, this); } public const string UserName = "Best Performance Selector"; diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs index 760482de23..0dd1d77acb 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs +++ 
b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs @@ -8,6 +8,7 @@ using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; +using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.Internal.Internallearn; [assembly: LoadableClass(typeof(BestPerformanceSelectorMultiClass), typeof(BestPerformanceSelectorMultiClass.Arguments), @@ -17,11 +18,14 @@ namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector { public class BestPerformanceSelectorMultiClass : BaseBestPerformanceSelector> { - public sealed class Arguments : ArgumentsBase + [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] + public sealed class Arguments : ArgumentsBase,ISupportSubModelSelectorFactory> { [Argument(ArgumentType.AtMostOnce, HelpText = "The metric type to be used to find the best performance", ShortName = "mn", SortOrder = 50)] [TGUI(Label = "Metric Name")] public MultiClassClassifierEvaluator.Metrics MetricName = MultiClassClassifierEvaluator.Metrics.AccuracyMicro; + + public ISubModelSelector> CreateComponent(IHostEnvironment env) => new BestPerformanceSelectorMultiClass(env, this); } public const string UserName = "Best Performance Selector"; diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/SubModelDataSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/SubModelDataSelector.cs index f3715ec353..c7e228cc9e 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/SubModelDataSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/SubModelDataSelector.cs @@ -20,12 +20,9 @@ public abstract class ArgumentsBase public Single ValidationDatasetProportion = 0.3f; } - private readonly Single _learnersSelectionProportion; - private readonly Single _validationDatasetProportion; + public Single LearnersSelectionProportion { get; } - public Single LearnersSelectionProportion { get { return 
_learnersSelectionProportion; } } - - public override Single ValidationDatasetProportion { get { return _validationDatasetProportion; } } + public override Single ValidationDatasetProportion { get; } protected SubModelDataSelector(ArgumentsBase args, IHostEnvironment env, string name) : base(env, name) @@ -37,8 +34,8 @@ protected SubModelDataSelector(ArgumentsBase args, IHostEnvironment env, string Host.CheckParam(0 < args.LearnersSelectionProportion && args.LearnersSelectionProportion < 1, nameof(args.LearnersSelectionProportion), "Should be greater than 0 and less than 1"); - _learnersSelectionProportion = args.LearnersSelectionProportion; - _validationDatasetProportion = args.ValidationDatasetProportion; + LearnersSelectionProportion = args.LearnersSelectionProportion; + ValidationDatasetProportion = args.ValidationDatasetProportion; } } } diff --git a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/AllInstanceSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/AllInstanceSelector.cs index 7bd84c914f..704ea0cbfa 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/AllInstanceSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/AllInstanceSelector.cs @@ -6,6 +6,7 @@ using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector; +using Microsoft.ML.Runtime.EntryPoints; [assembly: LoadableClass(typeof(AllInstanceSelector), typeof(AllInstanceSelector.Arguments), typeof(SignatureEnsembleDataSelector), AllInstanceSelector.UserName, AllInstanceSelector.LoadName)] @@ -17,8 +18,10 @@ public sealed class AllInstanceSelector : BaseSubsetSelector new AllInstanceSelector(env, this); } public AllInstanceSelector(IHostEnvironment env, Arguments args) diff --git a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs index cf1a659a64..dee1fb6d42 100644 --- 
a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs @@ -7,6 +7,7 @@ using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector; +using Microsoft.ML.Runtime.EntryPoints; [assembly: LoadableClass(typeof(BootstrapSelector), typeof(BootstrapSelector.Arguments), typeof(SignatureEnsembleDataSelector), BootstrapSelector.UserName, BootstrapSelector.LoadName)] @@ -18,16 +19,18 @@ public sealed class BootstrapSelector : BaseSubsetSelector new BootstrapSelector(env, this); } public BootstrapSelector(IHostEnvironment env, Arguments args) diff --git a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs index 876642bb00..c1bb6232e0 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs @@ -8,6 +8,7 @@ using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector; +using Microsoft.ML.Runtime.EntryPoints; [assembly: LoadableClass(typeof(RandomPartitionSelector),typeof(RandomPartitionSelector.Arguments), typeof(SignatureEnsembleDataSelector),RandomPartitionSelector.UserName, RandomPartitionSelector.LoadName)] @@ -19,8 +20,10 @@ public sealed class RandomPartitionSelector : BaseSubsetSelector new RandomPartitionSelector(env, this); } public RandomPartitionSelector(IHostEnvironment env, Arguments args) diff --git a/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs index a5eebad10f..0c14d10db9 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs @@ -13,6 +13,7 
@@ using Microsoft.ML.Runtime.Ensemble.OutputCombiners; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; +using Microsoft.ML.Ensemble.EntryPoints; [assembly: LoadableClass(EnsembleTrainer.Summary, typeof(EnsembleTrainer), typeof(EnsembleTrainer.Arguments), new[] { typeof(SignatureBinaryClassifierTrainer), typeof(SignatureTrainer) }, @@ -38,8 +39,8 @@ public sealed class Arguments : ArgumentsBase public Arguments() { BasePredictors = new[] { new SubComponent, SignatureBinaryClassifierTrainer>("LinearSVM") }; - OutputCombiner = new SubComponent(Median.LoadName); - SubModelSelectorType = new SubComponent(AllSelector.LoadName); + OutputCombiner = new MedianFactory(); + SubModelSelectorType = new AllSelectorFactory(); } } @@ -65,7 +66,7 @@ public TScalarPredictor CombineModels(IEnumerable m.Weight).ToArray(); if (weights.All(w => w == 1)) weights = null; - var combiner = Args.OutputCombiner.CreateInstance(Host); + var combiner = Args.OutputCombiner.CreateComponent(Host); var p = models.First().Value; TScalarPredictor predictor = null; diff --git a/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs b/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs index 724b57c45f..952c624893 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs @@ -39,8 +39,7 @@ public abstract class ArgumentsBase [Argument(ArgumentType.Multiple, HelpText = "Sampling Type", ShortName = "st", SortOrder = 2)] [TGUI(Label = "Sampling Type", Description = "Subset Selection Algorithm to induce the base learner.Sub-settings can be used to select the features")] - public SubComponent SamplingType - = new SubComponent(BootstrapSelector.LoadName); + public ISupportSubsetSelectorFactory SamplingType = new BootstrapSelector.Arguments(); [Argument(ArgumentType.AtMostOnce, HelpText = "All the base learners will run asynchronously if the value is true", ShortName = 
"tp", SortOrder = 106)] [TGUI(Label = "Train parallel", Description = "All the base learners will run asynchronously if the value is true")] @@ -54,12 +53,12 @@ public SubComponent SamplingType [Argument(ArgumentType.Multiple, HelpText = "Output combiner", ShortName = "oc", SortOrder = 5)] [TGUI(Label = "Output combiner", Description = "Output combiner type")] - public SubComponent OutputCombiner; + public ISupportOutputCombinerFactory OutputCombiner; [Argument(ArgumentType.Multiple, HelpText = "Algorithm to prune the base learners for selective Ensemble", ShortName = "pt", SortOrder = 4)] [TGUI(Label = "Sub-Model Selector(pruning) Type", Description = "Algorithm to prune the base learners for selective Ensemble")] - public SubComponent SubModelSelectorType; + public ISupportSubModelSelectorFactory SubModelSelectorType; [Argument(ArgumentType.Multiple, HelpText = "Base predictor type", ShortName = "bp,basePredictorTypes", SortOrder = 1)] public SubComponent>, TSig>[] BasePredictors; @@ -75,9 +74,9 @@ public SubComponent SamplingType protected readonly ITrainer>[] Trainers; private readonly ISubsetSelector _subsetSelector; - private readonly TSelector _subModelSelector; + private readonly ISubModelSelector _subModelSelector; - protected readonly TCombiner Combiner; + protected readonly IOutputCombiner Combiner; protected List>> Models; @@ -101,9 +100,9 @@ internal EnsembleTrainerBase(ArgumentsBase args, IHostEnvironment env, string na if (Utils.Size(Args.BasePredictors) > NumModels) ch.Warning("The base predictor count is greater than models count. 
Some of the base predictors will be ignored."); - _subsetSelector = Args.SamplingType.CreateInstance(Host); - _subModelSelector = Args.SubModelSelectorType.CreateInstance(Host); - Combiner = Args.OutputCombiner.CreateInstance(Host); + _subsetSelector = Args.SamplingType.CreateComponent(Host); + _subModelSelector = Args.SubModelSelectorType.CreateComponent(Host); + Combiner = Args.OutputCombiner.CreateComponent(Host); Trainers = new ITrainer>[NumModels]; for (int i = 0; i < Trainers.Length; i++) diff --git a/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs index f53412c922..c297526508 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.ML.Ensemble.EntryPoints; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.CommandLine; using Microsoft.ML.Runtime.Data; @@ -28,7 +29,7 @@ namespace Microsoft.ML.Runtime.Ensemble /// public sealed class MulticlassDataPartitionEnsembleTrainer : EnsembleTrainerBase, EnsembleMultiClassPredictor, - ISubModelSelector>, IOutputCombiner>, SignatureMultiClassClassifierTrainer>, + IMulticlassSubModelSelector, IOutputCombiner>, SignatureMultiClassClassifierTrainer>, IModelCombiner, TVectorPredictor> { public const string LoadNameValue = "WeightedEnsembleMulticlass"; @@ -40,8 +41,8 @@ public sealed class Arguments : ArgumentsBase public Arguments() { BasePredictors = new[] { new SubComponent, SignatureMultiClassClassifierTrainer>("MultiClassLogisticRegression") }; - OutputCombiner = new SubComponent>, SignatureCombiner>(MultiMedian.LoadName); - SubModelSelectorType = new SubComponent>, SignatureEnsembleSubModelSelector>(AllSelectorMultiClass.LoadName); + OutputCombiner = new 
MultiMedian.Arguments(); + SubModelSelectorType = new AllSelectorMultiClassFactory(); } } @@ -66,7 +67,7 @@ public TVectorPredictor CombineModels(IEnumerable new FeatureSubsetModel(k.Value)).ToArray(), - Args.OutputCombiner.CreateInstance(Host), weights); + Args.OutputCombiner.CreateComponent(Host), weights); return predictor; } diff --git a/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs index 7956f835a4..10349dbbd7 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.ML.Ensemble.EntryPoints; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.CommandLine; using Microsoft.ML.Runtime.Data; @@ -34,8 +35,8 @@ public sealed class Arguments : ArgumentsBase public Arguments() { BasePredictors = new[] { new SubComponent, SignatureRegressorTrainer>("OnlineGradientDescent") }; - OutputCombiner = new SubComponent(Median.LoadName); - SubModelSelectorType = new SubComponent(AllSelector.LoadName); + OutputCombiner = new MedianFactory(); + SubModelSelectorType = new AllSelectorFactory(); } } @@ -59,7 +60,7 @@ public TScalarPredictor CombineModels(IEnumerable m.Weight).ToArray(); if (weights.All(w => w == 1)) weights = null; - var combiner = Args.OutputCombiner.CreateInstance(Host); + var combiner = Args.OutputCombiner.CreateComponent(Host); var p = models.First().Value; var predictor = new EnsemblePredictor(Host, p.PredictionKind, From 9346554c8cb7f6036d2393d85cc42074e6f051ad Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Mon, 18 Jun 2018 17:23:10 -0700 Subject: [PATCH 03/20] small cleanup --- .../EntryPoints/OutputCombiner.cs | 10 ++- .../OutputCombiners/MultiMedian.cs | 4 +- .../OutputCombiners/MultiStacking.cs | 1 - .../BestDiverseSelectorBinary.cs | 2 - 
.../Microsoft.ML.Predictor.Tests.csproj | 1 + .../TestPredictors.cs | 73 ++++++++++++++++++- test/Microsoft.ML.TestFramework/Datasets.cs | 9 +-- test/Microsoft.ML.TestFramework/Learners.cs | 37 ++++++++++ test/Microsoft.ML.Tests/CSharpCodeGen.cs | 2 +- 9 files changed, 117 insertions(+), 22 deletions(-) diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs b/src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs index 0f2ede3d66..83a4efed97 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs +++ b/src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs @@ -1,9 +1,11 @@ -using Microsoft.ML.Runtime; +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Ensemble.OutputCombiners; using Microsoft.ML.Runtime.EntryPoints; -using System; -using System.Collections.Generic; -using System.Text; namespace Microsoft.ML.Ensemble.EntryPoints { diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs index 3360ea0378..5a9452ab1e 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs @@ -12,7 +12,7 @@ [assembly: LoadableClass(typeof(MultiMedian), typeof(MultiMedian.Arguments), typeof(SignatureCombiner), Median.UserName, MultiMedian.LoadName)] -[assembly: LoadableClass(typeof(MultiMedian), null, typeof(SignatureLoadModel),Median.UserName, MultiMedian.LoaderSignature)] +[assembly: LoadableClass(typeof(MultiMedian), null, typeof(SignatureLoadModel), Median.UserName, MultiMedian.LoaderSignature)] namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners { @@ -98,7 +98,5 @@ public override Combiner> GetCombiner() dst = new VBuffer(len, values, dst.Indices); }; } - - } } diff --git 
a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs index 7aa2f2c059..623fb1694f 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs @@ -11,7 +11,6 @@ using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Model; - [assembly: LoadableClass(typeof(MultiStacking), typeof(MultiStacking.Arguments), typeof(SignatureCombiner), Stacking.UserName, MultiStacking.LoadName)] diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs index 61b18855ce..dc9715ed3b 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs @@ -49,7 +49,5 @@ protected override PredictionKind PredictionKind { get { return PredictionKind.BinaryClassification; } } - - } } diff --git a/test/Microsoft.ML.Predictor.Tests/Microsoft.ML.Predictor.Tests.csproj b/test/Microsoft.ML.Predictor.Tests/Microsoft.ML.Predictor.Tests.csproj index 2329491e08..a8955ede87 100644 --- a/test/Microsoft.ML.Predictor.Tests/Microsoft.ML.Predictor.Tests.csproj +++ b/test/Microsoft.ML.Predictor.Tests/Microsoft.ML.Predictor.Tests.csproj @@ -12,6 +12,7 @@ + diff --git a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs index 91513e0e74..3ac470a767 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs @@ -45,7 +45,7 @@ public IList GetDatasetsForMulticlassClassifierTest() /// public IList GetDatasetsForRegressorTest() { - return new[] { TestDatasets.housing }; + return new[] { TestDatasets.winequality }; } /// @@ -350,6 +350,70 @@ public void FastForestRegressionTest() Done(); } + [Fact] + 
[TestCategory("Regressor"), TestCategory("Ensemble - Regression")] + public void RegressorEnsembleTest() + { + IList regressionPredictors = new PredictorAndArgs[] { TestLearners.EnsembleRegression }; + IList regressionDatasets = GetDatasetsForRegressorTest(); + RunAllTests(regressionPredictors, regressionDatasets); + Done(); + } + + [Fact] + [TestCategory("Regressor"), TestCategory("Ensemble - Regression")] + public void RegressorEnsembleNumModelsTest() + { + IList regressionPredictors = new PredictorAndArgs[] { TestLearners.EnsembleRegressionNumModels }; + IList regressionDatasets = GetDatasetsForRegressorTest(); + RunAllTests(regressionPredictors, regressionDatasets); + Done(); + } + + [Fact] + [ TestCategory("Regressor"), TestCategory("Ensemble - Regression")] + public void RegressorEnsembleAverageCombinerTest() + { + IList regressionPredictors = new PredictorAndArgs[] { TestLearners.EnsembleRegressionAverageCombiner, }; + IList regressionDatasets = GetDatasetsForRegressorTest(); + RunAllTests(regressionPredictors, regressionDatasets); + Done(); + } + + [Fact] + [TestCategory("Regressor"), TestCategory("Ensemble - Regression")] + public void RegressorEnsembleDiverseSelectorTest() + { + IList regressionPredictors = new PredictorAndArgs[] { TestLearners.EnsembleRegressionDiverseSelector }; + IList regressionDatasets = GetDatasetsForRegressorTest(); + RunAllTests(regressionPredictors, regressionDatasets); + Done(); + } + + [Fact] + [TestCategory("Regressor"), TestCategory("Ensemble - Regression")] + public void RegressorEnsemblePerformanceSelectorTest() + { + IList regressionPredictors = new PredictorAndArgs[] { TestLearners.EnsembleRegressionPerformanceSelector }; + IList regressionDatasets = GetDatasetsForRegressorTest(); + RunAllTests(regressionPredictors, regressionDatasets); + Done(); + } + + [Fact] + [ TestCategory("Regressor"), TestCategory("Ensemble - Regression"), TestCategory("FastTree")] + public void RegressorEnsembleStackingCombinerTest() + { + IList 
regressionPredictors = new PredictorAndArgs[] { TestLearners.EnsembleRegressionStackingCombiner }; + IList regressionDatasets = GetDatasetsForRegressorTest(); + RunMTAThread(() => + { + // Default is a FastTree learner, so we have to push it into an MTA thread. + RunAllTests(regressionPredictors, regressionDatasets); + }); + Done(); + } + [Fact(Skip = "Need CoreTLC specific baseline update")] [TestCategory("Weighting Predictors")] [TestCategory("FastForest")] @@ -565,7 +629,7 @@ public void FastTreeUnderbuiltRegressionTest() // case where the number of actual leaves is less than the number of maximum leaves per tree. RunMTAThread(() => { - Run_TrainTest(TestLearners.FastTreeUnderbuiltRegressor, TestDatasets.housing, null, "Underbuilt"); + Run_TrainTest(TestLearners.FastTreeUnderbuiltRegressor, TestDatasets.winequality, null, "Underbuilt"); }); Done(); } @@ -633,7 +697,7 @@ public void RegressorOlsTest() [TestCategory("Regressor")] public void RegressorOlsTestOne() { - Run_TrainTest(TestLearners.Ols, TestDatasets.housing); + Run_TrainTest(TestLearners.Ols, TestDatasets.winequality); Done(); } @@ -1689,4 +1753,7 @@ public void TestFeatureHandlerModelReuse() } } #endif + + + } diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs index 76bea677b6..a3cae62e9a 100644 --- a/test/Microsoft.ML.TestFramework/Datasets.cs +++ b/test/Microsoft.ML.TestFramework/Datasets.cs @@ -145,19 +145,12 @@ public static class TestDatasets testFilename = "vw.dat" }; - public static TestDataset housing = new TestDataset - { - name = "housing", - trainFilename = "housing.txt", - testFilename = "housing.txt" - }; - public static TestDataset winequality = new TestDataset { name = "wine", trainFilename = "external/winequality-white.csv", testFilename = "external/winequality-white.csv", - loaderSettings = "col=Label:R4:11 col=Features:R4:0-10 sep=; header+" + loaderSettings = "loader=Text{col=Label:R4:11 col=Features:R4:0-10 sep=semicolon header+}" 
}; public static TestDataset msm = new TestDataset diff --git a/test/Microsoft.ML.TestFramework/Learners.cs b/test/Microsoft.ML.TestFramework/Learners.cs index 9b64c0ac74..450bf98ac4 100644 --- a/test/Microsoft.ML.TestFramework/Learners.cs +++ b/test/Microsoft.ML.TestFramework/Learners.cs @@ -748,5 +748,42 @@ public static PredictorAndArgs DssmDefault(int qryFeaturesCount, int docFeatures MamlArgs = new[] { "xf=Copy{col=DupFeatures:Features} xf=MinMax{col=Features col=DupFeatures} norm=No", "col[Feature]=DupFeatures" }, BaselineProgress = true }; + + public static PredictorAndArgs EnsembleRegression = new PredictorAndArgs + { + Trainer = new SubComponent("EnsembleRegression"), + Tag = "Default", + }; + + public static PredictorAndArgs EnsembleRegressionNumModels = new PredictorAndArgs + { + Trainer = new SubComponent("EnsembleRegression", "bp=OGD bp=FTR"), + Tag = "NumModels", + }; + + public static PredictorAndArgs EnsembleRegressionDiverseSelector = new PredictorAndArgs + { + Trainer = new SubComponent("EnsembleRegression", "pt = BestDiverseSelectorRegression"), + Tag = "DiverseRegression", + }; + + public static PredictorAndArgs EnsembleRegressionPerformanceSelector = new PredictorAndArgs + { + Trainer = new SubComponent("EnsembleRegression", "pt = BestPerformanceRegressionSelector"), + Tag = "PerformanceSelector", + }; + + public static PredictorAndArgs EnsembleRegressionAverageCombiner = new PredictorAndArgs + { + Trainer = new SubComponent("EnsembleRegression", "oc=Average"), + Tag = "Average", + }; + + public static PredictorAndArgs EnsembleRegressionStackingCombiner = new PredictorAndArgs + { + Trainer = new SubComponent("EnsembleRegression", "oc=RegressionStacking"), + Tag = "RegressionStacking", + }; + } } \ No newline at end of file diff --git a/test/Microsoft.ML.Tests/CSharpCodeGen.cs b/test/Microsoft.ML.Tests/CSharpCodeGen.cs index 678edac461..5ee92d1c5e 100644 --- a/test/Microsoft.ML.Tests/CSharpCodeGen.cs +++ 
b/test/Microsoft.ML.Tests/CSharpCodeGen.cs @@ -16,7 +16,7 @@ public CSharpCodeGen(ITestOutputHelper output) : base(output) { } - [Fact(Skip = "Execute this test if you want to regenerate CSharpApi file")] + [Fact] public void RegenerateCSharpApi() { var basePath = GetDataPath("../../src/Microsoft.ML/CSharpApi.cs"); From 40e115b8a1589e8e5e191709435fd27a28c8a204 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Tue, 19 Jun 2018 13:23:32 -0700 Subject: [PATCH 04/20] add all required EP attributes, add entry points for ensembles. --- .../EntryPoints/DiversityMeasure.cs | 5 + .../EntryPoints/Ensemble.cs | 55 + .../EntryPoints/FeatureSelector.cs | 4 + .../EntryPoints/OutputCombiner.cs | 13 + .../EntryPoints/SubModelSelector.cs | 10 + .../OutputCombiners/BaseStacking.cs | 6 +- .../OutputCombiners/IOutputCombiner.cs | 1 + .../OutputCombiners/MultiAverage.cs | 4 +- .../OutputCombiners/MultiVoting.cs | 2 +- .../Selector/IDiversityMeasure.cs | 1 + .../Selector/IFeatureSelector.cs | 2 + .../Selector/ISubModelSelector.cs | 1 + .../Selector/ISubsetSelector.cs | 1 + .../SubsetSelector/AllInstanceSelector.cs | 2 + .../SubsetSelector/BaseSubsetSelector.cs | 7 +- .../SubsetSelector/BootstrapSelector.cs | 2 + .../SubsetSelector/RandomPartitionSelector.cs | 2 + .../Trainer/EnsembleDistributionPredictor.cs | 28 +- .../Trainer/EnsemblePredictor.cs | 22 +- .../Trainer/EnsembleTrainerBase.cs | 5 +- src/Microsoft.ML/CSharpApi.cs | 776 + .../Common/EntryPoints/core_ep-list.tsv | 3 + .../Common/EntryPoints/core_manifest.json | 13194 +++++++++------- test/Microsoft.ML.Tests/CSharpCodeGen.cs | 2 +- 24 files changed, 8141 insertions(+), 6007 deletions(-) create mode 100644 src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/DiversityMeasure.cs b/src/Microsoft.ML.Ensemble/EntryPoints/DiversityMeasure.cs index 9d605b9fb7..6dcaedd3ca 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/DiversityMeasure.cs +++ 
b/src/Microsoft.ML.Ensemble/EntryPoints/DiversityMeasure.cs @@ -3,12 +3,17 @@ // See the LICENSE file in the project root for more information. using System; +using Microsoft.ML.Ensemble.EntryPoints; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure; using Microsoft.ML.Runtime.EntryPoints; +[assembly: EntryPointModule(typeof(DisagreementDiversityFactory))] +[assembly: EntryPointModule(typeof(RegressionDisagreementDiversityFactory))] +[assembly: EntryPointModule(typeof(MultinDisagreementDiversityFactory))] + namespace Microsoft.ML.Ensemble.EntryPoints { [TlcModule.Component(Name = DisagreementDiversityMeasure.LoadName, FriendlyName = DisagreementDiversityMeasure.UserName)] diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs b/src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs new file mode 100644 index 0000000000..cb72aac2ae --- /dev/null +++ b/src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs @@ -0,0 +1,55 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using Microsoft.ML.Ensemble.EntryPoints; +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Ensemble; +using Microsoft.ML.Runtime.EntryPoints; + +[assembly: LoadableClass(typeof(void), typeof(Ensemble), null, typeof(SignatureEntryPointModule), "TrainEnsemble")] + +namespace Microsoft.ML.Ensemble.EntryPoints +{ + public static class Ensemble + { + [TlcModule.EntryPoint(Name = "Trainer.BinaryEnsemble", Desc = "Train binary ensemble.", UserName = EnsembleTrainer.UserNameValue)] + public static CommonOutputs.BinaryClassificationOutput CreateBinaryEnsemble(IHostEnvironment env, EnsembleTrainer.Arguments input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("TrainBinaryEnsemble"); + host.CheckValue(input, nameof(input)); + EntryPointUtils.CheckInputArgs(host, input); + + return LearnerEntryPointsUtils.Train(host, input, + () => new EnsembleTrainer(host, input), + () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.LabelColumn)); + } + + [TlcModule.EntryPoint(Name = "Trainer.ClassificationEnsemble", Desc = "Train multiclass ensemble.", UserName = EnsembleTrainer.UserNameValue)] + public static CommonOutputs.MulticlassClassificationOutput CreateMultiClassEnsemble(IHostEnvironment env, MulticlassDataPartitionEnsembleTrainer.Arguments input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("TrainMultiClassEnsemble"); + host.CheckValue(input, nameof(input)); + EntryPointUtils.CheckInputArgs(host, input); + + return LearnerEntryPointsUtils.Train(host, input, + () => new MulticlassDataPartitionEnsembleTrainer(host, input), + () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.LabelColumn)); + } + + [TlcModule.EntryPoint(Name = "Trainer.RegressionEnsemble", Desc = "Train regression ensemble.", UserName = EnsembleTrainer.UserNameValue)] + public static CommonOutputs.RegressionOutput CreateRegressionEnsemble(IHostEnvironment env, RegressionEnsembleTrainer.Arguments 
input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("TrainRegressionEnsemble"); + host.CheckValue(input, nameof(input)); + EntryPointUtils.CheckInputArgs(host, input); + + return LearnerEntryPointsUtils.Train(host, input, + () => new RegressionEnsembleTrainer(host, input), + () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.LabelColumn)); + } + } +} diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/FeatureSelector.cs b/src/Microsoft.ML.Ensemble/EntryPoints/FeatureSelector.cs index bc70fdb649..65ca5e9d06 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/FeatureSelector.cs +++ b/src/Microsoft.ML.Ensemble/EntryPoints/FeatureSelector.cs @@ -2,11 +2,15 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using Microsoft.ML.Ensemble.EntryPoints; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.FeatureSelector; using Microsoft.ML.Runtime.EntryPoints; +[assembly: EntryPointModule(typeof(AllFeatureSelectorFactory))] +[assembly: EntryPointModule(typeof(RandomFeatureSelector))] + namespace Microsoft.ML.Ensemble.EntryPoints { [TlcModule.Component(Name = AllFeatureSelector.LoadName, FriendlyName = AllFeatureSelector.UserName)] diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs b/src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs index 83a4efed97..0a3c0e7b0c 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs +++ b/src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs @@ -3,10 +3,23 @@ // See the LICENSE file in the project root for more information. 
using System; +using Microsoft.ML.Ensemble.EntryPoints; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Ensemble.OutputCombiners; using Microsoft.ML.Runtime.EntryPoints; +[assembly: EntryPointModule(typeof(AverageFactory))] +[assembly: EntryPointModule(typeof(MedianFactory))] +[assembly: EntryPointModule(typeof(MultiAverage))] +[assembly: EntryPointModule(typeof(MultiMedian))] +[assembly: EntryPointModule(typeof(MultiStacking))] +[assembly: EntryPointModule(typeof(MultiVoting))] +[assembly: EntryPointModule(typeof(MultiWeightedAverage))] +[assembly: EntryPointModule(typeof(RegressionStacking))] +[assembly: EntryPointModule(typeof(Stacking))] +[assembly: EntryPointModule(typeof(VotingFactory))] +[assembly: EntryPointModule(typeof(WeightedAverage))] + namespace Microsoft.ML.Ensemble.EntryPoints { [TlcModule.Component(Name = Average.LoadName, FriendlyName = Average.UserName)] diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs b/src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs index 57e397c789..2f3a42bc9c 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs +++ b/src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs @@ -3,12 +3,22 @@ // See the LICENSE file in the project root for more information. 
using System; +using Microsoft.ML.Ensemble.EntryPoints; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; using Microsoft.ML.Runtime.EntryPoints; +[assembly: EntryPointModule(typeof(AllSelectorFactory))] +[assembly: EntryPointModule(typeof(AllSelectorMultiClassFactory))] +[assembly: EntryPointModule(typeof(BestDiverseSelectorBinary))] +[assembly: EntryPointModule(typeof(BestDiverseSelectorMultiClass))] +[assembly: EntryPointModule(typeof(BestDiverseSelectorRegression))] +[assembly: EntryPointModule(typeof(BestPerformanceRegressionSelector))] +[assembly: EntryPointModule(typeof(BestPerformanceSelector))] +[assembly: EntryPointModule(typeof(BestPerformanceSelectorMultiClass))] + namespace Microsoft.ML.Ensemble.EntryPoints { [TlcModule.Component(Name = AllSelector.LoadName, FriendlyName = AllSelector.UserName)] diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseStacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseStacking.cs index 6b9badd1c5..96907347ee 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseStacking.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseStacking.cs @@ -19,11 +19,13 @@ public abstract class BaseStacking : IStackingTrainer>, TSigBase> BasePredictorType; } diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/IOutputCombiner.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/IOutputCombiner.cs index 1995114d20..836b560e8f 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/IOutputCombiner.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/IOutputCombiner.cs @@ -42,6 +42,7 @@ public interface IBinaryOutputCombiner : IOutputCombiner { } + [TlcModule.ComponentKind("EnsembleOutputCombiner")] public interface ISupportOutputCombinerFactory : IComponentFactory> { } diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs index 
f1cde7111f..f657ab2342 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs @@ -11,8 +11,8 @@ [assembly: LoadableClass(typeof(MultiAverage), typeof(MultiAverage.Arguments), typeof(SignatureCombiner), Average.UserName, MultiAverage.LoadName)] -[assembly: LoadableClass(typeof(MultiAverage), null, typeof(SignatureLoadModel), - Average.UserName, MultiAverage.LoadName, MultiAverage.LoaderSignature)] +[assembly: LoadableClass(typeof(MultiAverage), null, typeof(SignatureLoadModel), Average.UserName, + MultiAverage.LoadName, MultiAverage.LoaderSignature)] namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners { diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs index 3d6fcc2130..c1f9af36b1 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiVoting.cs @@ -11,7 +11,7 @@ using Microsoft.ML.Runtime.Model; using Microsoft.ML.Runtime.Numeric; -[assembly: LoadableClass(typeof(MultiVoting), null, typeof(SignatureCombiner), Voting.UserName, MultiVoting.LoadName)] +[assembly: LoadableClass(typeof(MultiVoting), typeof(MultiVoting.Arguments), typeof(SignatureCombiner), Voting.UserName, MultiVoting.LoadName)] [assembly: LoadableClass(typeof(MultiVoting), null, typeof(SignatureLoadModel), Voting.UserName, MultiVoting.LoaderSignature)] namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners diff --git a/src/Microsoft.ML.Ensemble/Selector/IDiversityMeasure.cs b/src/Microsoft.ML.Ensemble/Selector/IDiversityMeasure.cs index 0319a325c9..814cbe317c 100644 --- a/src/Microsoft.ML.Ensemble/Selector/IDiversityMeasure.cs +++ b/src/Microsoft.ML.Ensemble/Selector/IDiversityMeasure.cs @@ -17,6 +17,7 @@ List> CalculateDiversityMeasure(IList : IComponentFactory> { } diff --git a/src/Microsoft.ML.Ensemble/Selector/IFeatureSelector.cs 
b/src/Microsoft.ML.Ensemble/Selector/IFeatureSelector.cs index a854b50ba9..99e90c5c01 100644 --- a/src/Microsoft.ML.Ensemble/Selector/IFeatureSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/IFeatureSelector.cs @@ -13,6 +13,8 @@ public interface IFeatureSelector } public delegate void SignatureEnsembleFeatureSelector(); + + [TlcModule.ComponentKind("EnsembleFeatureSelector")] public interface ISupportFeatureSelectorFactory : IComponentFactory { } diff --git a/src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs b/src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs index 2e99a5ffdf..1bffb977b1 100644 --- a/src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs @@ -33,6 +33,7 @@ public interface IMulticlassSubModelSelector : ISubModelSelector public delegate void SignatureEnsembleSubModelSelector(); + [TlcModule.ComponentKind("EnsembleSubModelSelector")] public interface ISupportSubModelSelectorFactory : IComponentFactory> { } diff --git a/src/Microsoft.ML.Ensemble/Selector/ISubsetSelector.cs b/src/Microsoft.ML.Ensemble/Selector/ISubsetSelector.cs index 71d71d23f9..2a0f088219 100644 --- a/src/Microsoft.ML.Ensemble/Selector/ISubsetSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/ISubsetSelector.cs @@ -19,6 +19,7 @@ public interface ISubsetSelector public delegate void SignatureEnsembleDataSelector(); + [TlcModule.ComponentKind("EnsembleSubsetSelector")] public interface ISupportSubsetSelectorFactory : IComponentFactory { } diff --git a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/AllInstanceSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/AllInstanceSelector.cs index 704ea0cbfa..98e6eeb8eb 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/AllInstanceSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/AllInstanceSelector.cs @@ -11,6 +11,8 @@ [assembly: LoadableClass(typeof(AllInstanceSelector), typeof(AllInstanceSelector.Arguments), 
typeof(SignatureEnsembleDataSelector), AllInstanceSelector.UserName, AllInstanceSelector.LoadName)] +[assembly: EntryPointModule(typeof(AllInstanceSelector))] + namespace Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector { public sealed class AllInstanceSelector : BaseSubsetSelector diff --git a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BaseSubsetSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BaseSubsetSelector.cs index d3a6206502..b3455370a1 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BaseSubsetSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BaseSubsetSelector.cs @@ -4,9 +4,9 @@ using System; using System.Collections.Generic; +using Microsoft.ML.Ensemble.EntryPoints; using Microsoft.ML.Runtime.CommandLine; using Microsoft.ML.Runtime.Data; -using Microsoft.ML.Runtime.Ensemble.Selector.FeatureSelector; namespace Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector { @@ -16,8 +16,7 @@ public abstract class BaseSubsetSelector : ISubsetSelector public abstract class ArgumentsBase { [Argument(ArgumentType.Multiple, HelpText = "The Feature selector", ShortName = "fs", SortOrder = 1)] - public SubComponent FeatureSelector = - new SubComponent(AllFeatureSelector.LoadName); + public ISupportFeatureSelectorFactory FeatureSelector = new AllFeatureSelectorFactory(); } protected readonly IHost Host; @@ -37,7 +36,7 @@ protected BaseSubsetSelector(TArgs args, IHostEnvironment env, string name) Host = env.Register(name); Args = args; - FeatureSelector = Args.FeatureSelector.CreateInstance(Host); + FeatureSelector = Args.FeatureSelector.CreateComponent(Host); } public void Initialize(RoleMappedData data, int size, int batchSize, Single validationDatasetProportion) diff --git a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs index dee1fb6d42..f226f9b866 100644 --- 
a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs @@ -12,6 +12,8 @@ [assembly: LoadableClass(typeof(BootstrapSelector), typeof(BootstrapSelector.Arguments), typeof(SignatureEnsembleDataSelector), BootstrapSelector.UserName, BootstrapSelector.LoadName)] +[assembly: EntryPointModule(typeof(BootstrapSelector))] + namespace Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector { public sealed class BootstrapSelector : BaseSubsetSelector diff --git a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs index c1bb6232e0..7d07935a1e 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs @@ -13,6 +13,8 @@ [assembly: LoadableClass(typeof(RandomPartitionSelector),typeof(RandomPartitionSelector.Arguments), typeof(SignatureEnsembleDataSelector),RandomPartitionSelector.UserName, RandomPartitionSelector.LoadName)] +[assembly: EntryPointModule(typeof(RandomPartitionSelector))] + namespace Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector { public sealed class RandomPartitionSelector : BaseSubsetSelector diff --git a/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs b/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs index b183162f87..e9cbc4fc3c 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs @@ -44,32 +44,32 @@ private static VersionInfo GetVersionInfo() private readonly Single[] _averagedWeights; - private readonly PredictionKind _kind; private readonly Median _probabilityCombiner; - private readonly ColumnType _inputType; private readonly IValueMapperDist[] _mappers; - public ColumnType InputType { get { return _inputType; 
} } + public ColumnType InputType { get; } public ColumnType OutputType { get { return NumberType.Float; } } public ColumnType DistType { get { return NumberType.Float; } } + public override PredictionKind PredictionKind { get; } + internal EnsembleDistributionPredictor(IHostEnvironment env, PredictionKind kind, FeatureSubsetModel[] models, IOutputCombiner combiner, Single[] weights = null) : base(env, RegistrationName, models, combiner, weights) { - _kind = kind; + PredictionKind = kind; _probabilityCombiner = new Median(env); - _inputType = InitializeMappers(out _mappers); + InputType = InitializeMappers(out _mappers); ComputeAveragedWeights(out _averagedWeights); } private EnsembleDistributionPredictor(IHostEnvironment env, ModelLoadContext ctx) : base(env, RegistrationName, ctx) { - _kind = (PredictionKind)ctx.Reader.ReadInt32(); + PredictionKind = (PredictionKind)ctx.Reader.ReadInt32(); _probabilityCombiner = new Median(env); - _inputType = InitializeMappers(out _mappers); + InputType = InitializeMappers(out _mappers); ComputeAveragedWeights(out _averagedWeights); } @@ -118,11 +118,11 @@ protected override void SaveCore(ModelSaveContext ctx) ctx.SetVersionInfo(GetVersionInfo()); // *** Binary format *** - // int: _kind - ctx.Writer.Write((int)_kind); + // int: PredictionKind + ctx.Writer.Write((int)PredictionKind); } - public override PredictionKind PredictionKind { get { return _kind; } } + public ValueMapper GetMapper() { @@ -137,8 +137,8 @@ public ValueMapper GetMapper() ValueMapper, Single> del = (ref VBuffer src, ref Single dst) => { - if (_inputType.VectorSize > 0) - Host.Check(src.Length == _inputType.VectorSize); + if (InputType.VectorSize > 0) + Host.Check(src.Length == InputType.VectorSize); var tmp = src; Parallel.For(0, maps.Length, i => @@ -175,8 +175,8 @@ public ValueMapper GetMapper() ValueMapper, Single, Single> del = (ref VBuffer src, ref Single score, ref Single prob) => { - if (_inputType.VectorSize > 0) - Host.Check(src.Length == 
_inputType.VectorSize); + if (InputType.VectorSize > 0) + Host.Check(src.Length == InputType.VectorSize); var tmp = src; Parallel.For(0, maps.Length, i => diff --git a/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictor.cs b/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictor.cs index a5b5aa026d..3e51ee8a90 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictor.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictor.cs @@ -9,9 +9,11 @@ using Microsoft.ML.Runtime.Ensemble; using Microsoft.ML.Runtime.Model; using Microsoft.ML.Runtime.Ensemble.OutputCombiners; +using Microsoft.ML.Runtime.EntryPoints; [assembly: LoadableClass(typeof(EnsemblePredictor), null, typeof(SignatureLoadModel), EnsemblePredictor.UserName, EnsemblePredictor.LoaderSignature)] +[assembly: EntryPointModule(typeof(EnsemblePredictor))] namespace Microsoft.ML.Runtime.Ensemble { @@ -36,27 +38,25 @@ private static VersionInfo GetVersionInfo() loaderSignature: LoaderSignature); } - private readonly PredictionKind _kind; - private readonly ColumnType _inputType; private readonly IValueMapper[] _mappers; - public ColumnType InputType { get { return _inputType; } } + public ColumnType InputType { get; } public ColumnType OutputType { get { return NumberType.Float; } } - public override PredictionKind PredictionKind { get { return _kind; } } + public override PredictionKind PredictionKind { get; } internal EnsemblePredictor(IHostEnvironment env, PredictionKind kind, FeatureSubsetModel[] models, IOutputCombiner combiner, Single[] weights = null) : base(env, LoaderSignature, models, combiner, weights) { - _kind = kind; - _inputType = InitializeMappers(out _mappers); + PredictionKind = kind; + InputType = InitializeMappers(out _mappers); } private EnsemblePredictor(IHostEnvironment env, ModelLoadContext ctx) : base(env, RegistrationName, ctx) { - _kind = (PredictionKind)ctx.Reader.ReadInt32(); - _inputType = InitializeMappers(out _mappers); + PredictionKind = 
(PredictionKind)ctx.Reader.ReadInt32(); + InputType = InitializeMappers(out _mappers); } private ColumnType InitializeMappers(out IValueMapper[] mappers) @@ -105,7 +105,7 @@ protected override void SaveCore(ModelSaveContext ctx) // *** Binary format *** // int: _kind - ctx.Writer.Write((int)_kind); + ctx.Writer.Write((int)PredictionKind); } public ValueMapper GetMapper() @@ -123,8 +123,8 @@ public ValueMapper GetMapper() ValueMapper, Single> del = (ref VBuffer src, ref Single dst) => { - if (_inputType.VectorSize > 0) - Host.Check(src.Length == _inputType.VectorSize); + if (InputType.VectorSize > 0) + Host.Check(src.Length == InputType.VectorSize); var tmp = src; Parallel.For(0, maps.Length, i => diff --git a/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs b/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs index 952c624893..718709a87a 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs @@ -11,6 +11,7 @@ using Microsoft.ML.Runtime.Ensemble.OutputCombiners; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.SubsetSelector; +using Microsoft.ML.Runtime.EntryPoints; using Microsoft.ML.Runtime.Internal.Internallearn; using Microsoft.ML.Runtime.Internal.Utilities; using Microsoft.ML.Runtime.Training; @@ -23,7 +24,7 @@ public abstract class EnsembleTrainerBase where TCombiner : class, IOutputCombiner { - public abstract class ArgumentsBase + public abstract class ArgumentsBase : LearnerInputBaseWithLabel { [Argument(ArgumentType.AtMostOnce, HelpText = "Number of models per batch. 
If not specified, will default to 50 if there is only one base predictor, " + @@ -60,7 +61,7 @@ public abstract class ArgumentsBase Description = "Algorithm to prune the base learners for selective Ensemble")] public ISupportSubModelSelectorFactory SubModelSelectorType; - [Argument(ArgumentType.Multiple, HelpText = "Base predictor type", ShortName = "bp,basePredictorTypes", SortOrder = 1)] + [Argument(ArgumentType.Multiple, HelpText = "Base predictor type", ShortName = "bp,basePredictorTypes", SortOrder = 1, Visibility =ArgumentAttribute.VisibilityType.CmdLineOnly)] public SubComponent>, TSig>[] BasePredictors; public const int DefaultNumModels = 50; diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index 58ea963f37..f4b46427d5 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -478,6 +478,42 @@ public void Add(Microsoft.ML.Models.TrainTestEvaluator input, Microsoft.ML.Model _jsonNodes.Add(Serialize("Models.TrainTestEvaluator", input, output)); } + public Microsoft.ML.Trainer.BinaryEnsemble.Output Add(Microsoft.ML.Trainer.BinaryEnsemble input) + { + var output = new Microsoft.ML.Trainer.BinaryEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Trainer.BinaryEnsemble input, Microsoft.ML.Trainer.BinaryEnsemble.Output output) + { + _jsonNodes.Add(Serialize("Trainer.BinaryEnsemble", input, output)); + } + + public Microsoft.ML.Trainer.ClassificationEnsemble.Output Add(Microsoft.ML.Trainer.ClassificationEnsemble input) + { + var output = new Microsoft.ML.Trainer.ClassificationEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Trainer.ClassificationEnsemble input, Microsoft.ML.Trainer.ClassificationEnsemble.Output output) + { + _jsonNodes.Add(Serialize("Trainer.ClassificationEnsemble", input, output)); + } + + public Microsoft.ML.Trainer.RegressionEnsemble.Output Add(Microsoft.ML.Trainer.RegressionEnsemble input) + { + var output 
= new Microsoft.ML.Trainer.RegressionEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Trainer.RegressionEnsemble input, Microsoft.ML.Trainer.RegressionEnsemble.Output output) + { + _jsonNodes.Add(Serialize("Trainer.RegressionEnsemble", input, output)); + } + public Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier.Output Add(Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier input) { var output = new Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier.Output(); @@ -3969,6 +4005,339 @@ public sealed class Output } } + namespace Trainer + { + + /// + /// Train binary ensemble. + /// + public sealed partial class BinaryEnsemble : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + { + + + /// + /// Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise. + /// + public int? NumModels { get; set; } + + /// + /// Batch size + /// + public int BatchSize { get; set; } = -1; + + /// + /// Sampling Type + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleSubsetSelector SamplingType { get; set; } = new BootstrapSelectorEnsembleSubsetSelector(); + + /// + /// All the base learners will run asynchronously if the value is true + /// + public bool TrainParallel { get; set; } = false; + + /// + /// True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set + /// + public bool ShowMetrics { get; set; } = false; + + /// + /// Output combiner + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleOutputCombiner OutputCombiner { get; set; } = new MedianEnsembleOutputCombiner(); + + /// + /// Algorithm to prune the base learners for selective Ensemble + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleSubModelSelector SubModelSelectorType { get; set; } = new AllSelectorEnsembleSubModelSelector(); + + /// + /// Column to use for labels + /// + public string LabelColumn { get; set; } = "Label"; + + /// + /// The data to be used for training + /// + public Var TrainingData { get; set; } = new Var(); + + /// + /// Column to use for features + /// + public string FeatureColumn { get; set; } = "Features"; + + /// + /// Normalize option for the feature column + /// + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; + + /// + /// Whether learner should cache input training data + /// + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + public Var GetInputData() => TrainingData; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(BinaryEnsemble)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + TrainingData = dataStep.Data; + } + Output output = experiment.Add(this); + return new 
BinaryEnsemblePipelineStep(output); + } + + private class BinaryEnsemblePipelineStep : ILearningPipelinePredictorStep + { + public BinaryEnsemblePipelineStep(Output output) + { + Model = output.PredictorModel; + } + + public Var Model { get; } + } + } + } + + namespace Trainer + { + + /// + /// Train multiclass ensemble. + /// + public sealed partial class ClassificationEnsemble : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + { + + + /// + /// Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise. + /// + public int? NumModels { get; set; } + + /// + /// Batch size + /// + public int BatchSize { get; set; } = -1; + + /// + /// Sampling Type + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleSubsetSelector SamplingType { get; set; } = new BootstrapSelectorEnsembleSubsetSelector(); + + /// + /// All the base learners will run asynchronously if the value is true + /// + public bool TrainParallel { get; set; } = false; + + /// + /// True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set + /// + public bool ShowMetrics { get; set; } = false; + + /// + /// Output combiner + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleOutputCombiner OutputCombiner { get; set; } = new MultiMedianEnsembleOutputCombiner(); + + /// + /// Algorithm to prune the base learners for selective Ensemble + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleSubModelSelector SubModelSelectorType { get; set; } = new AllSelectorMultiClassEnsembleSubModelSelector(); + + /// + /// Column to use for labels + /// + public string LabelColumn { get; set; } = "Label"; + + /// + /// The data to be used for training + /// + public Var TrainingData { get; set; } = new Var(); + + /// + /// Column to use for features + /// + public string FeatureColumn { get; set; } = "Features"; + + /// + /// Normalize option for the feature column + /// + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; + + /// + /// Whether learner should cache input training data + /// + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IMulticlassClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + public Var GetInputData() => TrainingData; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(ClassificationEnsemble)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + TrainingData = dataStep.Data; + } + Output output = experiment.Add(this); + return new 
ClassificationEnsemblePipelineStep(output); + } + + private class ClassificationEnsemblePipelineStep : ILearningPipelinePredictorStep + { + public ClassificationEnsemblePipelineStep(Output output) + { + Model = output.PredictorModel; + } + + public Var Model { get; } + } + } + } + + namespace Trainer + { + + /// + /// Train regression ensemble. + /// + public sealed partial class RegressionEnsemble : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + { + + + /// + /// Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise. + /// + public int? NumModels { get; set; } + + /// + /// Batch size + /// + public int BatchSize { get; set; } = -1; + + /// + /// Sampling Type + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleSubsetSelector SamplingType { get; set; } = new BootstrapSelectorEnsembleSubsetSelector(); + + /// + /// All the base learners will run asynchronously if the value is true + /// + public bool TrainParallel { get; set; } = false; + + /// + /// True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set + /// + public bool ShowMetrics { get; set; } = false; + + /// + /// Output combiner + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleOutputCombiner OutputCombiner { get; set; } = new MedianEnsembleOutputCombiner(); + + /// + /// Algorithm to prune the base learners for selective Ensemble + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleSubModelSelector SubModelSelectorType { get; set; } = new AllSelectorEnsembleSubModelSelector(); + + /// + /// Column to use for labels + /// + public string LabelColumn { get; set; } = "Label"; + + /// + /// The data to be used for training + /// + public Var TrainingData { get; set; } = new Var(); + + /// + /// Column to use for features + /// + public string FeatureColumn { get; set; } = "Features"; + + /// + /// Normalize option for the feature column + /// + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; + + /// + /// Whether learner should cache input training data + /// + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + public Var GetInputData() => TrainingData; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(RegressionEnsemble)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + TrainingData = dataStep.Data; + } + Output output = experiment.Add(this); + return new 
RegressionEnsemblePipelineStep(output); + } + + private class RegressionEnsemblePipelineStep : ILearningPipelinePredictorStep + { + public RegressionEnsemblePipelineStep(Output output) + { + Model = output.PredictorModel; + } + + public Var Model { get; } + } + } + } + namespace Trainers { @@ -14251,6 +14620,413 @@ public sealed class UPEarlyStoppingCriterion : EarlyStoppingCriterion internal override string ComponentName => "UP"; } + public abstract class EnsembleDiversityMeasure : ComponentKind {} + + + + public sealed class DisagreementDiversityMeasureEnsembleDiversityMeasure : EnsembleDiversityMeasure + { + internal override string ComponentName => "DisagreementDiversityMeasure"; + } + + + + public sealed class MultiDisagreementDiversityMeasureEnsembleDiversityMeasure : EnsembleDiversityMeasure + { + internal override string ComponentName => "MultiDisagreementDiversityMeasure"; + } + + + + public sealed class RegressionDisagreementDiversityMeasureEnsembleDiversityMeasure : EnsembleDiversityMeasure + { + internal override string ComponentName => "RegressionDisagreementDiversityMeasure"; + } + + public abstract class EnsembleFeatureSelector : ComponentKind {} + + + + public sealed class AllFeatureSelectorEnsembleFeatureSelector : EnsembleFeatureSelector + { + internal override string ComponentName => "AllFeatureSelector"; + } + + + + public sealed class RandomFeatureSelectorEnsembleFeatureSelector : EnsembleFeatureSelector + { + /// + /// The proportion of features to be selected. 
The range is 0.0-1.0 + /// + public float FeaturesSelectionProportion { get; set; } = 0.8f; + + internal override string ComponentName => "RandomFeatureSelector"; + } + + public abstract class EnsembleOutputCombiner : ComponentKind {} + + + + public sealed class AverageEnsembleOutputCombiner : EnsembleOutputCombiner + { + internal override string ComponentName => "Average"; + } + + + + public sealed class MedianEnsembleOutputCombiner : EnsembleOutputCombiner + { + internal override string ComponentName => "Median"; + } + + + + public sealed class MultiAverageEnsembleOutputCombiner : EnsembleOutputCombiner + { + /// + /// Whether to normalize the output of base models before combining them + /// + public bool Normalize { get; set; } = true; + + internal override string ComponentName => "MultiAverage"; + } + + + + public sealed class MultiMedianEnsembleOutputCombiner : EnsembleOutputCombiner + { + /// + /// Whether to normalize the output of base models before combining them + /// + public bool Normalize { get; set; } = true; + + internal override string ComponentName => "MultiMedian"; + } + + + + public sealed class MultiStackingEnsembleOutputCombiner : EnsembleOutputCombiner + { + /// + /// The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; + + internal override string ComponentName => "MultiStacking"; + } + + + + public sealed class MultiVotingEnsembleOutputCombiner : EnsembleOutputCombiner + { + /// + /// Whether to normalize the output of base models before combining them + /// + public bool Normalize { get; set; } = true; + + internal override string ComponentName => "MultiVoting"; + } + + public enum MultiWeightageKind + { + AccuracyMicroAvg = 0, + AccuracyMacroAvg = 1 + } + + + + public sealed class MultiWeightedAverageEnsembleOutputCombiner : EnsembleOutputCombiner + { + /// + /// The metric type to be used to find the weights for each model + /// + public MultiWeightageKind WeightageName { get; set; } = MultiWeightageKind.AccuracyMicroAvg; + + /// + /// Whether to normalize the output of base models before combining them + /// + public bool Normalize { get; set; } = true; + + internal override string ComponentName => "MultiWeightedAverage"; + } + + + + public sealed class RegressionStackingEnsembleOutputCombiner : EnsembleOutputCombiner + { + /// + /// The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; + + internal override string ComponentName => "RegressionStacking"; + } + + + + public sealed class StackingEnsembleOutputCombiner : EnsembleOutputCombiner + { + /// + /// The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; + + internal override string ComponentName => "Stacking"; + } + + + + public sealed class VotingEnsembleOutputCombiner : EnsembleOutputCombiner + { + internal override string ComponentName => "Voting"; + } + + public enum WeightageKind + { + Accuracy = 0, + Auc = 1, + PosPrecision = 2, + PosRecall = 3, + NegPrecision = 4, + NegRecall = 5 + } + + + + public sealed class WeightedAverageEnsembleOutputCombiner : EnsembleOutputCombiner + { + /// + /// The metric type to be used to find the weights for each model + /// + public WeightageKind WeightageName { get; set; } = WeightageKind.Auc; + + internal override string ComponentName => "WeightedAverage"; + } + + public abstract class EnsembleSubModelSelector : ComponentKind {} + + + + public sealed class AllSelectorEnsembleSubModelSelector : EnsembleSubModelSelector + { + internal override string ComponentName => "AllSelector"; + } + + + + public sealed class AllSelectorMultiClassEnsembleSubModelSelector : EnsembleSubModelSelector + { + internal override string ComponentName => "AllSelectorMultiClass"; + } + + + + public sealed class BestDiverseSelectorEnsembleSubModelSelector : EnsembleSubModelSelector + { + /// + /// The metric type to be used to find the diversity among base learners + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleDiversityMeasure DiversityMetricType { get; set; } + + /// + /// The proportion of best base learners to be selected. The range is 0.0-1.0 + /// + public float LearnersSelectionProportion { get; set; } = 0.5f; + + /// + /// The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; + + internal override string ComponentName => "BestDiverseSelector"; + } + + + + public sealed class BestDiverseSelectorMultiClassEnsembleSubModelSelector : EnsembleSubModelSelector + { + /// + /// The metric type to be used to find the diversity among base learners + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleDiversityMeasure DiversityMetricType { get; set; } + + /// + /// The proportion of best base learners to be selected. The range is 0.0-1.0 + /// + public float LearnersSelectionProportion { get; set; } = 0.5f; + + /// + /// The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; + + internal override string ComponentName => "BestDiverseSelectorMultiClass"; + } + + + + public sealed class BestDiverseSelectorRegressionEnsembleSubModelSelector : EnsembleSubModelSelector + { + /// + /// The metric type to be used to find the diversity among base learners + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleDiversityMeasure DiversityMetricType { get; set; } + + /// + /// The proportion of best base learners to be selected. The range is 0.0-1.0 + /// + public float LearnersSelectionProportion { get; set; } = 0.5f; + + /// + /// The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; + + internal override string ComponentName => "BestDiverseSelectorRegression"; + } + + public enum RegressionEvaluatorMetrics + { + L1 = 0, + L2 = 1, + Rms = 2, + Loss = 3, + RSquared = 4 + } + + + + public sealed class BestPerformanceRegressionSelectorEnsembleSubModelSelector : EnsembleSubModelSelector + { + /// + /// The metric type to be used to find the best performance + /// + public RegressionEvaluatorMetrics MetricName { get; set; } = RegressionEvaluatorMetrics.L1; + + /// + /// The proportion of best base learners to be selected. The range is 0.0-1.0 + /// + public float LearnersSelectionProportion { get; set; } = 0.5f; + + /// + /// The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; + + internal override string ComponentName => "BestPerformanceRegressionSelector"; + } + + public enum BinaryClassifierEvaluatorMetrics + { + Accuracy = 0, + PosPrecName = 1, + PosRecallName = 2, + NegPrecName = 3, + NegRecallName = 4, + Auc = 5, + LogLoss = 6, + LogLossReduction = 7, + F1 = 8, + AuPrc = 9 + } + + + + public sealed class BestPerformanceSelectorEnsembleSubModelSelector : EnsembleSubModelSelector + { + /// + /// The metric type to be used to find the best performance + /// + public BinaryClassifierEvaluatorMetrics MetricName { get; set; } = BinaryClassifierEvaluatorMetrics.Auc; + + /// + /// The proportion of best base learners to be selected. The range is 0.0-1.0 + /// + public float LearnersSelectionProportion { get; set; } = 0.5f; + + /// + /// The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; + + internal override string ComponentName => "BestPerformanceSelector"; + } + + public enum MultiClassClassifierEvaluatorMetrics + { + AccuracyMicro = 0, + AccuracyMacro = 1, + LogLoss = 2, + LogLossReduction = 3 + } + + + + public sealed class BestPerformanceSelectorMultiClassEnsembleSubModelSelector : EnsembleSubModelSelector + { + /// + /// The metric type to be used to find the best performance + /// + public MultiClassClassifierEvaluatorMetrics MetricName { get; set; } = MultiClassClassifierEvaluatorMetrics.AccuracyMicro; + + /// + /// The proportion of best base learners to be selected. The range is 0.0-1.0 + /// + public float LearnersSelectionProportion { get; set; } = 0.5f; + + /// + /// The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; + + internal override string ComponentName => "BestPerformanceSelectorMultiClass"; + } + + public abstract class EnsembleSubsetSelector : ComponentKind {} + + + + public sealed class AllInstanceSelectorEnsembleSubsetSelector : EnsembleSubsetSelector + { + /// + /// The Feature selector + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleFeatureSelector FeatureSelector { get; set; } = new AllFeatureSelectorEnsembleFeatureSelector(); + + internal override string ComponentName => "AllInstanceSelector"; + } + + + + public sealed class BootstrapSelectorEnsembleSubsetSelector : EnsembleSubsetSelector + { + /// + /// The Feature selector + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleFeatureSelector FeatureSelector { get; set; } = new AllFeatureSelectorEnsembleFeatureSelector(); + + internal override string ComponentName => "BootstrapSelector"; + } + + + + public sealed class RandomPartitionSelectorEnsembleSubsetSelector : EnsembleSubsetSelector + { + /// 
+ /// The Feature selector + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleFeatureSelector FeatureSelector { get; set; } = new AllFeatureSelectorEnsembleFeatureSelector(); + + internal override string ComponentName => "RandomPartitionSelector"; + } + public abstract class FastTreeTrainer : ComponentKind {} diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index 16925eec02..10678ace1b 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -36,6 +36,9 @@ Models.Summarizer Summarize a linear regression predictor. Microsoft.ML.Runtime. Models.SweepResultExtractor Extracts the sweep result. Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro ExtractSweepResult Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro+ResultInput Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro+Output Models.TrainTestBinaryEvaluator Train test for binary classification Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro TrainTestBinary Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Output] Models.TrainTestEvaluator General train test for any supported evaluator Microsoft.ML.Runtime.EntryPoints.TrainTestMacro TrainTest Microsoft.ML.Runtime.EntryPoints.TrainTestMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.TrainTestMacro+Output] +Trainer.BinaryEnsemble Train binary ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateBinaryEnsemble Microsoft.ML.Runtime.Ensemble.EnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput +Trainer.ClassificationEnsemble Train multiclass ensemble. 
Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateMultiClassEnsemble Microsoft.ML.Runtime.Ensemble.MulticlassDataPartitionEnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput +Trainer.RegressionEnsemble Train regression ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateRegressionEnsemble Microsoft.ML.Runtime.Ensemble.RegressionEnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.AveragedPerceptronBinaryClassifier Train a Average perceptron. Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer TrainBinary Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput Trainers.FastForestBinaryClassifier Uses a random forest learner to perform binary classification. Microsoft.ML.Runtime.FastTree.FastForest TrainBinary Microsoft.ML.Runtime.FastTree.FastForestClassification+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput Trainers.FastForestRegressor Trains a random forest to fit target values using least-squares. 
Microsoft.ML.Runtime.FastTree.FastForest TrainRegression Microsoft.ML.Runtime.FastTree.FastForestRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index 21f2427b72..dc1692802b 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -3718,10 +3718,10 @@ ] }, { - "Name": "Trainers.AveragedPerceptronBinaryClassifier", - "Desc": "Train a Average perceptron.", - "FriendlyName": "Averaged Perceptron", - "ShortName": "ap", + "Name": "Trainer.BinaryEnsemble", + "Desc": "Train binary ensemble.", + "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", + "ShortName": null, "Inputs": [ { "Name": "TrainingData", @@ -3734,6 +3734,28 @@ "SortOrder": 1.0, "IsNullable": false }, + { + "Name": "SamplingType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleSubsetSelector" + }, + "Desc": "Sampling Type", + "Aliases": [ + "st" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": { + "Name": "BootstrapSelector", + "Settings": { + "FeatureSelector": { + "Name": "AllFeatureSelector" + } + } + } + }, { "Name": "FeatureColumn", "Type": "String", @@ -3746,6 +3768,18 @@ "IsNullable": false, "Default": "Features" }, + { + "Name": "NumModels", + "Type": "Int", + "Desc": "Number of models per batch. 
If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise.", + "Aliases": [ + "nm" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": true, + "Default": null + }, { "Name": "LabelColumn", "Type": "String", @@ -3758,6 +3792,40 @@ "IsNullable": false, "Default": "Label" }, + { + "Name": "SubModelSelectorType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleSubModelSelector" + }, + "Desc": "Algorithm to prune the base learners for selective Ensemble", + "Aliases": [ + "pt" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": { + "Name": "AllSelector" + } + }, + { + "Name": "OutputCombiner", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleOutputCombiner" + }, + "Desc": "Output combiner", + "Aliases": [ + "oc" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": { + "Name": "Median" + } + }, { "Name": "NormalizeFeatures", "Type": { @@ -3799,253 +3867,242 @@ "Default": "Auto" }, { - "Name": "LossFunction", - "Type": { - "Kind": "Component", - "ComponentKind": "ClassificationLossFunction" - }, - "Desc": "Loss Function", + "Name": "TrainParallel", + "Type": "Bool", + "Desc": "All the base learners will run asynchronously if the value is true", "Aliases": [ - "loss" + "tp" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 106.0, "IsNullable": false, - "Default": { - "Name": "HingeLoss" - } + "Default": false }, { - "Name": "LearningRate", - "Type": "Float", - "Desc": "Learning rate", + "Name": "BatchSize", + "Type": "Int", + "Desc": "Batch size", "Aliases": [ - "lr" + "bs" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 107.0, "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.01, - 0.1, - 0.5, - 1.0 - ] - } + "Default": -1 }, { - "Name": "DecreaseLearningRate", + "Name": "ShowMetrics", "Type": "Bool", - "Desc": "Decrease learning rate", - "Aliases": 
[ - "decreaselr" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": false, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } - }, - { - "Name": "L2RegularizerWeight", - "Type": "Float", - "Desc": "L2 Regularization Weight", + "Desc": "True, if metrics for each model need to be evaluated and shown in comparison table. This is done by using validation set if available or the training set", "Aliases": [ - "reg" + "sm" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 108.0, "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 0.5 - } - }, + "Default": false + } + ], + "Outputs": [ { - "Name": "NumIterations", - "Type": "Int", - "Desc": "Number of iterations", - "Aliases": [ - "iter" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 1, - "SweepRange": { - "RangeType": "Long", - "Min": 1, - "Max": 100, - "StepSize": 10.0, - "IsLogScale": true - } - }, + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IBinaryClassificationOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainer.ClassificationEnsemble", + "Desc": "Train multiclass ensemble.", + "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", + "ShortName": null, + "Inputs": [ { - "Name": "InitWtsDiameter", - "Type": "Float", - "Desc": "Init weights diameter", + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", "Aliases": [ - "initwts" + "data" ], - "Required": false, - "SortOrder": 140.0, - "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 5 - } + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "Calibrator", + "Name": "SamplingType", "Type": { "Kind": "Component", - 
"ComponentKind": "CalibratorTrainer" + "ComponentKind": "EnsembleSubsetSelector" }, - "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", + "Desc": "Sampling Type", + "Aliases": [ + "st" + ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, "Default": { - "Name": "PlattCalibrator" + "Name": "BootstrapSelector", + "Settings": { + "FeatureSelector": { + "Name": "AllFeatureSelector" + } + } } }, { - "Name": "MaxCalibrationExamples", - "Type": "Int", - "Desc": "The maximum number of examples to use when training the calibrator", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", + "Aliases": [ + "feat" + ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": 1000000 + "Default": "Features" }, { - "Name": "ResetWeightsAfterXExamples", + "Name": "NumModels", "Type": "Int", - "Desc": "Number of examples after which weights will be reset to the current average", + "Desc": "Number of models per batch. 
If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise.", "Aliases": [ - "numreset" + "nm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": true, "Default": null }, { - "Name": "DoLazyUpdates", - "Type": "Bool", - "Desc": "Instead of updating averaged weights on every example, only update when loss is nonzero", - "Aliases": [ - "lazy" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "RecencyGain", - "Type": "Float", - "Desc": "Extra weight given to more recent updates", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", "Aliases": [ - "rg" + "lab" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 0.0 + "Default": "Label" }, { - "Name": "RecencyGainMulti", - "Type": "Bool", - "Desc": "Whether Recency Gain is multiplicative (vs. additive)", + "Name": "SubModelSelectorType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleSubModelSelector" + }, + "Desc": "Algorithm to prune the base learners for selective Ensemble", "Aliases": [ - "rgm" + "pt" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": false + "Default": { + "Name": "AllSelectorMultiClass" + } }, { - "Name": "Averaged", - "Type": "Bool", - "Desc": "Do averaging?", + "Name": "OutputCombiner", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleOutputCombiner" + }, + "Desc": "Output combiner", "Aliases": [ - "avg" + "oc" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": true + "Default": { + "Name": "MultiMedian" + } }, { - "Name": "AveragedTolerance", - "Type": "Float", - "Desc": "The inexactness tolerance for averaging", + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option 
for the feature column", "Aliases": [ - "avgtol" + "norm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": 0.01 + "Default": "Auto" }, { - "Name": "InitialWeights", - "Type": "String", - "Desc": "Initial Weights and bias, comma-separated", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", "Aliases": [ - "initweights" + "cache" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": null + "Default": "Auto" }, { - "Name": "Shuffle", + "Name": "TrainParallel", "Type": "Bool", - "Desc": "Whether to shuffle for each training iteration", + "Desc": "All the base learners will run asynchronously if the value is true", "Aliases": [ - "shuf" + "tp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 106.0, "IsNullable": false, - "Default": true, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } + "Default": false }, { - "Name": "StreamingCacheSize", + "Name": "BatchSize", "Type": "Int", - "Desc": "Size of cache when trained in Scope", + "Desc": "Batch size", "Aliases": [ - "cache" + "bs" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 107.0, "IsNullable": false, - "Default": 1000000 + "Default": -1 + }, + { + "Name": "ShowMetrics", + "Type": "Bool", + "Desc": "True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set", + "Aliases": [ + "sm" + ], + "Required": false, + "SortOrder": 108.0, + "IsNullable": false, + "Default": false } ], "Outputs": [ @@ -4060,36 +4117,16 @@ "ITrainerInput" ], "OutputKind": [ - "IBinaryClassificationOutput", + "IMulticlassClassificationOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.FastForestBinaryClassifier", - "Desc": "Uses a random forest learner to perform binary classification.", - "FriendlyName": "Fast Forest Classification", - "ShortName": "ff", + "Name": "Trainer.RegressionEnsemble", + "Desc": "Train regression ensemble.", + "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", + "ShortName": null, "Inputs": [ - { - "Name": "NumTrees", - "Type": "Int", - "Desc": "Number of weak hypotheses in the ensemble", - "Aliases": [ - "iter" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": 100, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 20, - 100, - 500 - ] - } - }, { "Name": "TrainingData", "Type": "DataView", @@ -4102,22 +4139,25 @@ "IsNullable": false }, { - "Name": "NumLeaves", - "Type": "Int", - "Desc": "The max number of leaves in each regression tree", + "Name": "SamplingType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleSubsetSelector" + }, + "Desc": "Sampling Type", "Aliases": [ - "nl" + "st" ], "Required": false, "SortOrder": 2.0, "IsNullable": false, - "Default": 20, - "SweepRange": { - "RangeType": "Long", - "Min": 2, - "Max": 128, - "StepSize": 4.0, - "IsLogScale": true + "Default": { + "Name": "BootstrapSelector", + "Settings": { + "FeatureSelector": { + "Name": "AllFeatureSelector" + } + } } }, { @@ -4133,24 +4173,16 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "NumModels", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "Number of models per batch. 
If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise.", "Aliases": [ - "mil" + "nm" ], "Required": false, "SortOrder": 3.0, - "IsNullable": false, - "Default": 10, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 1, - 10, - 50 - ] - } + "IsNullable": true, + "Default": null }, { "Name": "LabelColumn", @@ -4165,28 +4197,38 @@ "Default": "Label" }, { - "Name": "WeightColumn", - "Type": "String", - "Desc": "Column to use for example weight", + "Name": "SubModelSelectorType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleSubModelSelector" + }, + "Desc": "Algorithm to prune the base learners for selective Ensemble", "Aliases": [ - "weight" + "pt" ], "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": { + "Name": "AllSelector" + } }, { - "Name": "GroupIdColumn", - "Type": "String", - "Desc": "Column to use for example groupId", + "Name": "OutputCombiner", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleOutputCombiner" + }, + "Desc": "Output combiner", "Aliases": [ - "groupId" + "oc" ], "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": { + "Name": "Median" + } }, { "Name": "NormalizeFeatures", @@ -4229,472 +4271,387 @@ "Default": "Auto" }, { - "Name": "MaxTreeOutput", - "Type": "Float", - "Desc": "Upper bound on absolute value of single tree output", + "Name": "TrainParallel", + "Type": "Bool", + "Desc": "All the base learners will run asynchronously if the value is true", "Aliases": [ - "mo" + "tp" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 100.0 - }, - { - "Name": "Calibrator", - "Type": { - "Kind": "Component", - "ComponentKind": "CalibratorTrainer" - }, - "Desc": "The calibrator kind to apply to the predictor. 
Specify null for no calibration", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": { - "Name": "PlattCalibrator" - } - }, - { - "Name": "MaxCalibrationExamples", - "Type": "Int", - "Desc": "The maximum number of examples to use when training the calibrator", - "Required": false, - "SortOrder": 150.0, + "SortOrder": 106.0, "IsNullable": false, - "Default": 1000000 + "Default": false }, { - "Name": "QuantileSampleCount", + "Name": "BatchSize", "Type": "Int", - "Desc": "Number of labels to be sampled from each leaf to make the distribtuion", + "Desc": "Batch size", "Aliases": [ - "qsc" + "bs" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 107.0, "IsNullable": false, - "Default": 100 + "Default": -1 }, { - "Name": "ParallelTrainer", - "Type": { - "Kind": "Component", - "ComponentKind": "ParallelTraining" - }, - "Desc": "Allows to choose Parallel FastTree Learning Algorithm", + "Name": "ShowMetrics", + "Type": "Bool", + "Desc": "True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set", "Aliases": [ - "parag" + "sm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 108.0, "IsNullable": false, - "Default": { - "Name": "Single" - } - }, + "Default": false + } + ], + "Outputs": [ { - "Name": "NumThreads", - "Type": "Int", - "Desc": "The number of threads to use", + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IRegressionOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.AveragedPerceptronBinaryClassifier", + "Desc": "Train a Average perceptron.", + "FriendlyName": "Averaged Perceptron", + "ShortName": "ap", + "Inputs": [ + { + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", "Aliases": [ - "t" + "data" ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "RngSeed", - "Type": "Int", - "Desc": "The seed of the random number generator", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", "Aliases": [ - "r1" + "feat" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": 123 + "Default": "Features" }, { - "Name": "FeatureSelectSeed", - "Type": "Int", - "Desc": "The seed of the active feature selection", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", "Aliases": [ - "r3" + "lab" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 123 + "Default": "Label" }, { - "Name": "EntropyCoefficient", - "Type": "Float", - "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option 
for the feature column", "Aliases": [ - "e" + "norm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": 0.0 + "Default": "Auto" }, { - "Name": "HistogramPoolSize", - "Type": "Int", - "Desc": "The number of histograms in the pool (between 2 and numLeaves)", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", "Aliases": [ - "ps" + "cache" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": -1 + "Default": "Auto" }, { - "Name": "DiskTranspose", - "Type": "Bool", - "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", + "Name": "LossFunction", + "Type": { + "Kind": "Component", + "ComponentKind": "ClassificationLossFunction" + }, + "Desc": "Loss Function", "Aliases": [ - "dt" + "loss" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "SortOrder": 50.0, + "IsNullable": false, + "Default": { + "Name": "HingeLoss" + } }, { - "Name": "FeatureFlocks", - "Type": "Bool", - "Desc": "Whether to collectivize features during dataset preparation to speed up training", + "Name": "LearningRate", + "Type": "Float", + "Desc": "Learning rate", "Aliases": [ - "flocks" + "lr" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": true + "Default": 1.0, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.01, + 0.1, + 0.5, + 1.0 + ] + } }, { - "Name": "CategoricalSplit", + "Name": "DecreaseLearningRate", "Type": "Bool", - "Desc": "Whether to do split based on multiple categorical feature values.", + "Desc": "Decrease learning rate", "Aliases": [ - "cat" + "decreaselr" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": false + "Default": false, + "SweepRange": { 
+ "RangeType": "Discrete", + "Values": [ + false, + true + ] + } }, { - "Name": "MaxCategoricalGroupsPerNode", - "Type": "Int", - "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", + "Name": "L2RegularizerWeight", + "Type": "Float", + "Desc": "L2 Regularization Weight", "Aliases": [ - "mcg" + "reg" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 64 + "Default": 0.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.0, + "Max": 0.5 + } }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "NumIterations", "Type": "Int", - "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", + "Desc": "Number of iterations", "Aliases": [ - "maxcat" + "iter" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 64 + "Default": 1, + "SweepRange": { + "RangeType": "Long", + "Min": 1, + "Max": 100, + "StepSize": 10.0, + "IsLogScale": true + } }, { - "Name": "MinDocsPercentageForCategoricalSplit", + "Name": "InitWtsDiameter", "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Desc": "Init weights diameter", "Aliases": [ - "mdop" + "initwts" ], "Required": false, + "SortOrder": 140.0, + "IsNullable": false, + "Default": 0.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.0, + "Max": 1.0, + "NumSteps": 5 + } + }, + { + "Name": "Calibrator", + "Type": { + "Kind": "Component", + "ComponentKind": "CalibratorTrainer" + }, + "Desc": "The calibrator kind to apply to the predictor. 
Specify null for no calibration", + "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.001 + "Default": { + "Name": "PlattCalibrator" + } }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MaxCalibrationExamples", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", - "Aliases": [ - "mdo" - ], + "Desc": "The maximum number of examples to use when training the calibrator", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 100 + "Default": 1000000 }, { - "Name": "Bias", - "Type": "Float", - "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", + "Name": "ResetWeightsAfterXExamples", + "Type": "Int", + "Desc": "Number of examples after which weights will be reset to the current average", "Aliases": [ - "bias" + "numreset" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 + "IsNullable": true, + "Default": null }, { - "Name": "Bundling", - "Type": { - "Kind": "Enum", - "Values": [ - "None", - "AggregateLowPopulation", - "Adjacent" - ] - }, - "Desc": "Bundle low population bins. 
Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", + "Name": "DoLazyUpdates", + "Type": "Bool", + "Desc": "Instead of updating averaged weights on every example, only update when loss is nonzero", "Aliases": [ - "bundle" + "lazy" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "None" + "Default": true }, { - "Name": "MaxBins", - "Type": "Int", - "Desc": "Maximum number of distinct values (bins) per feature", + "Name": "RecencyGain", + "Type": "Float", + "Desc": "Extra weight given to more recent updates", "Aliases": [ - "mb" + "rg" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 255 + "Default": 0.0 }, { - "Name": "SparsifyThreshold", - "Type": "Float", - "Desc": "Sparsity level needed to use sparse feature representation", + "Name": "RecencyGainMulti", + "Type": "Bool", + "Desc": "Whether Recency Gain is multiplicative (vs. additive)", "Aliases": [ - "sp" + "rgm" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.7 + "Default": false }, { - "Name": "FeatureFirstUsePenalty", - "Type": "Float", - "Desc": "The feature first use penalty coefficient", + "Name": "Averaged", + "Type": "Bool", + "Desc": "Do averaging?", "Aliases": [ - "ffup" + "avg" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": true }, { - "Name": "FeatureReusePenalty", + "Name": "AveragedTolerance", "Type": "Float", - "Desc": "The feature re-use penalty (regularization) coefficient", + "Desc": "The inexactness tolerance for averaging", "Aliases": [ - "frup" + "avgtol" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 0.01 }, { - "Name": "GainConfidenceLevel", - "Type": "Float", - "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", + "Name": "InitialWeights", + "Type": "String", + "Desc": "Initial Weights and bias, 
comma-separated", "Aliases": [ - "gainconf" + "initweights" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": null }, { - "Name": "SoftmaxTemperature", - "Type": "Float", - "Desc": "The temperature of the randomized softmax distribution for choosing the feature", - "Aliases": [ - "smtemp" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "ExecutionTimes", - "Type": "Bool", - "Desc": "Print execution time breakdown to stdout", - "Aliases": [ - "et" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "FeatureFraction", - "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each iteration", - "Aliases": [ - "ff" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.7 - }, - { - "Name": "BaggingSize", - "Type": "Int", - "Desc": "Number of trees in each bag (0 for disabling bagging)", - "Aliases": [ - "bag" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1 - }, - { - "Name": "BaggingTrainFraction", - "Type": "Float", - "Desc": "Percentage of training examples used in each bag", - "Aliases": [ - "bagfrac" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.7 - }, - { - "Name": "SplitFraction", - "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each split", - "Aliases": [ - "sf" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.7 - }, - { - "Name": "Smoothing", - "Type": "Float", - "Desc": "Smoothing paramter for tree regularization", - "Aliases": [ - "s" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "AllowEmptyTrees", - "Type": "Bool", - "Desc": "When a root split is impossible, allow training to proceed", - "Aliases": [ - "allowempty", - "dummies" - ], - 
"Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "FeatureCompressionLevel", - "Type": "Int", - "Desc": "The level of feature compression to use", - "Aliases": [ - "fcomp" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1 - }, - { - "Name": "CompressEnsemble", - "Type": "Bool", - "Desc": "Compress the tree Ensemble", - "Aliases": [ - "cmp" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, - { - "Name": "PrintTestGraph", - "Type": "Bool", - "Desc": "Print metrics graph for the first test set", - "Aliases": [ - "graph" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "PrintTrainValidGraph", + "Name": "Shuffle", "Type": "Bool", - "Desc": "Print Train and Validation metrics in graph", + "Desc": "Whether to shuffle for each training iteration", "Aliases": [ - "graphtv" + "shuf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": true, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } }, { - "Name": "TestFrequency", + "Name": "StreamingCacheSize", "Type": "Int", - "Desc": "Calculate metric values for train/valid/test every k rounds", + "Desc": "Size of cache when trained in Scope", "Aliases": [ - "tf" + "cache" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 2147483647 + "Default": 1000000 } ], "Outputs": [ @@ -4705,8 +4662,6 @@ } ], "InputKind": [ - "ITrainerInputWithGroupId", - "ITrainerInputWithWeight", "ITrainerInputWithLabel", "ITrainerInput" ], @@ -4716,10 +4671,10 @@ ] }, { - "Name": "Trainers.FastForestRegressor", - "Desc": "Trains a random 
forest to fit target values using least-squares.", - "FriendlyName": "FastForestRegression", - "ShortName": "ffr", + "Name": "Trainers.FastForestBinaryClassifier", + "Desc": "Uses a random forest learner to perform binary classification.", + "FriendlyName": "Fast Forest Classification", + "ShortName": "ff", "Inputs": [ { "Name": "NumTrees", @@ -4880,13 +4835,39 @@ "Default": "Auto" }, { - "Name": "ShuffleLabels", - "Type": "Bool", - "Desc": "Shuffle the labels on every iteration. Useful probably only if using this tree as a tree leaf featurizer for multiclass.", + "Name": "MaxTreeOutput", + "Type": "Float", + "Desc": "Upper bound on absolute value of single tree output", + "Aliases": [ + "mo" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 100.0 + }, + { + "Name": "Calibrator", + "Type": { + "Kind": "Component", + "ComponentKind": "CalibratorTrainer" + }, + "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": { + "Name": "PlattCalibrator" + } + }, + { + "Name": "MaxCalibrationExamples", + "Type": "Int", + "Desc": "The maximum number of examples to use when training the calibrator", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000000 }, { "Name": "QuantileSampleCount", @@ -5336,15 +5317,15 @@ "ITrainerInput" ], "OutputKind": [ - "IRegressionOutput", + "IBinaryClassificationOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.FastTreeBinaryClassifier", - "Desc": "Uses a logit-boost boosted tree learner to perform binary classification.", - "FriendlyName": "FastTree (Boosted Trees) Classification", - "ShortName": "ftc", + "Name": "Trainers.FastForestRegressor", + "Desc": "Trains a random forest to fit target values using least-squares.", + "FriendlyName": "FastForestRegression", + "ShortName": "ffr", "Inputs": [ { "Name": "NumTrees", @@ -5440,24 +5421,6 @@ "IsNullable": 
false, "Default": "Label" }, - { - "Name": "LearningRates", - "Type": "Float", - "Desc": "The learning rate", - "Aliases": [ - "lr" - ], - "Required": false, - "SortOrder": 4.0, - "IsNullable": false, - "Default": 0.2, - "SweepRange": { - "RangeType": "Float", - "Min": 0.025, - "Max": 0.4, - "IsLogScale": true - } - }, { "Name": "WeightColumn", "Type": "String", @@ -5523,570 +5486,301 @@ "Default": "Auto" }, { - "Name": "UnbalancedSets", + "Name": "ShuffleLabels", "Type": "Bool", - "Desc": "Should we use derivatives optimized for unbalanced sets", - "Aliases": [ - "us" - ], + "Desc": "Shuffle the labels on every iteration. Useful probably only if using this tree as a tree leaf featurizer for multiclass.", "Required": false, "SortOrder": 150.0, "IsNullable": false, "Default": false }, { - "Name": "BestStepRankingRegressionTrees", - "Type": "Bool", - "Desc": "Use best regression step trees?", + "Name": "QuantileSampleCount", + "Type": "Int", + "Desc": "Number of labels to be sampled from each leaf to make the distribtuion", "Aliases": [ - "bsr" + "qsc" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 100 }, { - "Name": "UseLineSearch", - "Type": "Bool", - "Desc": "Should we use line search for a step size", + "Name": "ParallelTrainer", + "Type": { + "Kind": "Component", + "ComponentKind": "ParallelTraining" + }, + "Desc": "Allows to choose Parallel FastTree Learning Algorithm", "Aliases": [ - "ls" + "parag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": { + "Name": "Single" + } }, { - "Name": "NumPostBracketSteps", + "Name": "NumThreads", "Type": "Int", - "Desc": "Number of post-bracket line search steps", + "Desc": "The number of threads to use", "Aliases": [ - "lssteps" + "t" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": 0 + "IsNullable": true, + "Default": null }, { - "Name": "MinStepSize", - "Type": "Float", - "Desc": "Minimum line search 
step size", + "Name": "RngSeed", + "Type": "Int", + "Desc": "The seed of the random number generator", "Aliases": [ - "minstep" + "r1" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 123 }, { - "Name": "OptimizationAlgorithm", - "Type": { - "Kind": "Enum", - "Values": [ - "GradientDescent", - "AcceleratedGradientDescent", - "ConjugateGradientDescent" - ] - }, - "Desc": "Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent)", + "Name": "FeatureSelectSeed", + "Type": "Int", + "Desc": "The seed of the active feature selection", "Aliases": [ - "oa" + "r3" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "GradientDescent" + "Default": 123 }, { - "Name": "EarlyStoppingRule", - "Type": { - "Kind": "Component", - "ComponentKind": "EarlyStoppingCriterion" - }, - "Desc": "Early stopping rule. (Validation set (/valid) is required.)", + "Name": "EntropyCoefficient", + "Type": "Float", + "Desc": "The entropy (regularization) coefficient between 0 and 1", "Aliases": [ - "esr" + "e" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": 0.0 }, { - "Name": "EarlyStoppingMetrics", + "Name": "HistogramPoolSize", "Type": "Int", - "Desc": "Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3)", + "Desc": "The number of histograms in the pool (between 2 and numLeaves)", "Aliases": [ - "esmt" + "ps" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": -1 }, { - "Name": "EnablePruning", + "Name": "DiskTranspose", "Type": "Bool", - "Desc": "Enable post-training pruning to avoid overfitting. 
(a validation set is required)", + "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", "Aliases": [ - "pruning" + "dt" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": false + "IsNullable": true, + "Default": null }, { - "Name": "UseTolerantPruning", + "Name": "FeatureFlocks", "Type": "Bool", - "Desc": "Use window and tolerance for pruning", + "Desc": "Whether to collectivize features during dataset preparation to speed up training", "Aliases": [ - "prtol" + "flocks" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": true }, { - "Name": "PruningThreshold", - "Type": "Float", - "Desc": "The tolerance threshold for pruning", + "Name": "CategoricalSplit", + "Type": "Bool", + "Desc": "Whether to do split based on multiple categorical feature values.", "Aliases": [ - "prth" + "cat" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.004 + "Default": false }, { - "Name": "PruningWindowSize", + "Name": "MaxCategoricalGroupsPerNode", "Type": "Int", - "Desc": "The moving window size for pruning", + "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ - "prws" + "mcg" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 5 + "Default": 64 }, { - "Name": "Shrinkage", - "Type": "Float", - "Desc": "Shrinkage", + "Name": "MaxCategoricalSplitPoints", + "Type": "Int", + "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ - "shrk" + "maxcat" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.025, - "Max": 4.0, - "IsLogScale": true - } + "Default": 64 }, { - "Name": "DropoutRate", + "Name": "MinDocsPercentageForCategoricalSplit", "Type": "Float", - "Desc": "Dropout rate for tree regularization", + "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", "Aliases": [ - "tdrop" + "mdop" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.0, - 1E-09, - 0.05, - 0.1, - 0.2 - ] - } + "Default": 0.001 }, { - "Name": "GetDerivativesSampleRate", + "Name": "MinDocsForCategoricalSplit", "Type": "Int", - "Desc": "Sample each query 1 in k times in the GetDerivatives function", + "Desc": "Minimum categorical doc count in a bin to consider for a split.", "Aliases": [ - "sr" + "mdo" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": 100 }, { - "Name": "WriteLastEnsemble", - "Type": "Bool", - "Desc": "Write the last ensemble instead of the one determined by early stopping", + "Name": "Bias", + "Type": "Float", + "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", "Aliases": [ - "hl" + "bias" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.0 }, { - "Name": "MaxTreeOutput", - "Type": "Float", - "Desc": "Upper bound on absolute value of single tree 
output", + "Name": "Bundling", + "Type": { + "Kind": "Enum", + "Values": [ + "None", + "AggregateLowPopulation", + "Adjacent" + ] + }, + "Desc": "Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", "Aliases": [ - "mo" + "bundle" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 100.0 + "Default": "None" }, { - "Name": "RandomStart", - "Type": "Bool", - "Desc": "Training starts from random ordering (determined by /r1)", + "Name": "MaxBins", + "Type": "Int", + "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ - "rs" + "mb" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 255 }, { - "Name": "FilterZeroLambdas", - "Type": "Bool", - "Desc": "Filter zero lambdas during training", + "Name": "SparsifyThreshold", + "Type": "Float", + "Desc": "Sparsity level needed to use sparse feature representation", "Aliases": [ - "fzl" + "sp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.7 }, { - "Name": "BaselineScoresFormula", - "Type": "String", - "Desc": "Freeform defining the scores that should be used as the baseline ranker", + "Name": "FeatureFirstUsePenalty", + "Type": "Float", + "Desc": "The feature first use penalty coefficient", "Aliases": [ - "basescores" + "ffup" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": 0.0 }, { - "Name": "BaselineAlphaRisk", - "Type": "String", - "Desc": "Baseline alpha for tradeoffs of risk (0 is normal training)", + "Name": "FeatureReusePenalty", + "Type": "Float", + "Desc": "The feature re-use penalty (regularization) coefficient", "Aliases": [ - "basealpha" + "frup" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": 0.0 }, { - "Name": "PositionDiscountFreeform", - "Type": "String", - "Desc": "The 
discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Name": "GainConfidenceLevel", + "Type": "Float", + "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", "Aliases": [ - "pdff" + "gainconf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": 0.0 }, { - "Name": "ParallelTrainer", - "Type": { - "Kind": "Component", - "ComponentKind": "ParallelTraining" - }, - "Desc": "Allows to choose Parallel FastTree Learning Algorithm", + "Name": "SoftmaxTemperature", + "Type": "Float", + "Desc": "The temperature of the randomized softmax distribution for choosing the feature", "Aliases": [ - "parag" + "smtemp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": { - "Name": "Single" - } + "Default": 0.0 }, { - "Name": "NumThreads", - "Type": "Int", - "Desc": "The number of threads to use", + "Name": "ExecutionTimes", + "Type": "Bool", + "Desc": "Print execution time breakdown to stdout", "Aliases": [ - "t" + "et" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "RngSeed", - "Type": "Int", - "Desc": "The seed of the random number generator", - "Aliases": [ - "r1" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 123 - }, - { - "Name": "FeatureSelectSeed", - "Type": "Int", - "Desc": "The seed of the active feature selection", - "Aliases": [ - "r3" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 123 - }, - { - "Name": "EntropyCoefficient", - "Type": "Float", - "Desc": "The entropy (regularization) coefficient between 0 and 1", - "Aliases": [ - "e" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "HistogramPoolSize", - "Type": "Int", - "Desc": "The number of histograms in the pool (between 2 and 
numLeaves)", - "Aliases": [ - "ps" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, - { - "Name": "DiskTranspose", - "Type": "Bool", - "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", - "Aliases": [ - "dt" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "FeatureFlocks", - "Type": "Bool", - "Desc": "Whether to collectivize features during dataset preparation to speed up training", - "Aliases": [ - "flocks" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "CategoricalSplit", - "Type": "Bool", - "Desc": "Whether to do split based on multiple categorical feature values.", - "Aliases": [ - "cat" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "MaxCategoricalGroupsPerNode", - "Type": "Int", - "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", - "Aliases": [ - "mcg" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 64 - }, - { - "Name": "MaxCategoricalSplitPoints", - "Type": "Int", - "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", - "Aliases": [ - "maxcat" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 64 - }, - { - "Name": "MinDocsPercentageForCategoricalSplit", - "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", - "Aliases": [ - "mdop" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.001 - }, - { - "Name": "MinDocsForCategoricalSplit", - "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", - "Aliases": [ - "mdo" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 100 - }, - { - "Name": "Bias", - "Type": "Float", - "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", - "Aliases": [ - "bias" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "Bundling", - "Type": { - "Kind": "Enum", - "Values": [ - "None", - "AggregateLowPopulation", - "Adjacent" - ] - }, - "Desc": "Bundle low population bins. 
Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", - "Aliases": [ - "bundle" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "None" - }, - { - "Name": "MaxBins", - "Type": "Int", - "Desc": "Maximum number of distinct values (bins) per feature", - "Aliases": [ - "mb" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 255 - }, - { - "Name": "SparsifyThreshold", - "Type": "Float", - "Desc": "Sparsity level needed to use sparse feature representation", - "Aliases": [ - "sp" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.7 - }, - { - "Name": "FeatureFirstUsePenalty", - "Type": "Float", - "Desc": "The feature first use penalty coefficient", - "Aliases": [ - "ffup" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "FeatureReusePenalty", - "Type": "Float", - "Desc": "The feature re-use penalty (regularization) coefficient", - "Aliases": [ - "frup" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "GainConfidenceLevel", - "Type": "Float", - "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", - "Aliases": [ - "gainconf" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "SoftmaxTemperature", - "Type": "Float", - "Desc": "The temperature of the randomized softmax distribution for choosing the feature", - "Aliases": [ - "smtemp" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "ExecutionTimes", - "Type": "Bool", - "Desc": "Print execution time breakdown to stdout", - "Aliases": [ - "et" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false + "IsNullable": false, + "Default": false }, { 
"Name": "FeatureFraction", @@ -6098,7 +5792,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0 + "Default": 0.7 }, { "Name": "BaggingSize", @@ -6110,7 +5804,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1 }, { "Name": "BaggingTrainFraction", @@ -6134,7 +5828,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0 + "Default": 0.7 }, { "Name": "Smoothing", @@ -6248,15 +5942,15 @@ "ITrainerInput" ], "OutputKind": [ - "IBinaryClassificationOutput", + "IRegressionOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.FastTreeRanker", - "Desc": "Trains gradient boosted decision trees to the LambdaRank quasi-gradient.", - "FriendlyName": "FastTree (Boosted Trees) Ranking", - "ShortName": "ftrank", + "Name": "Trainers.FastTreeBinaryClassifier", + "Desc": "Uses a logit-boost boosted tree learner to perform binary classification.", + "FriendlyName": "FastTree (Boosted Trees) Classification", + "ShortName": "ftc", "Inputs": [ { "Name": "NumTrees", @@ -6435,92 +6129,11 @@ "Default": "Auto" }, { - "Name": "CustomGains", - "Type": "String", - "Desc": "Comma seperated list of gains associated to each relevance label.", - "Aliases": [ - "gains" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "0,3,7,15,31" - }, - { - "Name": "TrainDcg", - "Type": "Bool", - "Desc": "Train DCG instead of NDCG", - "Aliases": [ - "dcg" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "SortingAlgorithm", - "Type": "String", - "Desc": "The sorting algorithm to use for DCG and LambdaMart calculations [DescendingStablePessimistic/DescendingStable/DescendingReverse/DescendingDotNet]", - "Aliases": [ - "sort" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "DescendingStablePessimistic" - }, - { - "Name": "LambdaMartMaxTruncation", - "Type": "Int", - "Desc": "max-NDCG truncation 
to use in the Lambda Mart algorithm", - "Aliases": [ - "n" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 100 - }, - { - "Name": "ShiftedNdcg", - "Type": "Bool", - "Desc": "Use shifted NDCG", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "CostFunctionParam", - "Type": "Char", - "Desc": "Cost function parameter (w/c)", - "Aliases": [ - "cf" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "w" - }, - { - "Name": "DistanceWeight2", - "Type": "Bool", - "Desc": "Distance weight 2 adjustment to cost", - "Aliases": [ - "dw" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "NormalizeQueryLambdas", + "Name": "UnbalancedSets", "Type": "Bool", - "Desc": "Normalize query lambdas", + "Desc": "Should we use derivatives optimized for unbalanced sets", "Aliases": [ - "nql" + "us" ], "Required": false, "SortOrder": 150.0, @@ -6619,7 +6232,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": 0 }, { "Name": "EnablePruning", @@ -7241,15 +6854,15 @@ "ITrainerInput" ], "OutputKind": [ - "IRankingOutput", + "IBinaryClassificationOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.FastTreeRegressor", - "Desc": "Trains gradient boosted decision trees to fit target values using least-squares.", - "FriendlyName": "FastTree (Boosted Trees) Regression", - "ShortName": "ftr", + "Name": "Trainers.FastTreeRanker", + "Desc": "Trains gradient boosted decision trees to the LambdaRank quasi-gradient.", + "FriendlyName": "FastTree (Boosted Trees) Ranking", + "ShortName": "ftrank", "Inputs": [ { "Name": "NumTrees", @@ -7427,6 +7040,99 @@ "IsNullable": false, "Default": "Auto" }, + { + "Name": "CustomGains", + "Type": "String", + "Desc": "Comma seperated list of gains associated to each relevance label.", + "Aliases": [ + "gains" + ], + "Required": false, + "SortOrder": 150.0, 
+ "IsNullable": false, + "Default": "0,3,7,15,31" + }, + { + "Name": "TrainDcg", + "Type": "Bool", + "Desc": "Train DCG instead of NDCG", + "Aliases": [ + "dcg" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "SortingAlgorithm", + "Type": "String", + "Desc": "The sorting algorithm to use for DCG and LambdaMart calculations [DescendingStablePessimistic/DescendingStable/DescendingReverse/DescendingDotNet]", + "Aliases": [ + "sort" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "DescendingStablePessimistic" + }, + { + "Name": "LambdaMartMaxTruncation", + "Type": "Int", + "Desc": "max-NDCG truncation to use in the Lambda Mart algorithm", + "Aliases": [ + "n" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 100 + }, + { + "Name": "ShiftedNdcg", + "Type": "Bool", + "Desc": "Use shifted NDCG", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "CostFunctionParam", + "Type": "Char", + "Desc": "Cost function parameter (w/c)", + "Aliases": [ + "cf" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "w" + }, + { + "Name": "DistanceWeight2", + "Type": "Bool", + "Desc": "Distance weight 2 adjustment to cost", + "Aliases": [ + "dw" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "NormalizeQueryLambdas", + "Type": "Bool", + "Desc": "Normalize query lambdas", + "Aliases": [ + "nql" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, { "Name": "BestStepRankingRegressionTrees", "Type": "Bool", @@ -8141,15 +7847,15 @@ "ITrainerInput" ], "OutputKind": [ - "IRegressionOutput", + "IRankingOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.FastTreeTweedieRegressor", - "Desc": "Trains gradient boosted decision trees to fit target values using a Tweedie loss function. 
This learner is a generalization of Poisson, compound Poisson, and gamma regression.", - "FriendlyName": "FastTree (Boosted Trees) Tweedie Regression", - "ShortName": "fttweedie", + "Name": "Trainers.FastTreeRegressor", + "Desc": "Trains gradient boosted decision trees to fit target values using least-squares.", + "FriendlyName": "FastTree (Boosted Trees) Regression", + "ShortName": "ftr", "Inputs": [ { "Name": "NumTrees", @@ -8327,15 +8033,6 @@ "IsNullable": false, "Default": "Auto" }, - { - "Name": "Index", - "Type": "Float", - "Desc": "Index parameter for the Tweedie distribution, in the range [1, 2]. 1 is Poisson loss, 2 is gamma loss, and intermediate values are compound Poisson loss.", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1.5 - }, { "Name": "BestStepRankingRegressionTrees", "Type": "Bool", @@ -8428,7 +8125,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1 }, { "Name": "EnablePruning", @@ -9055,28 +8752,28 @@ ] }, { - "Name": "Trainers.GeneralizedAdditiveModelBinaryClassifier", - "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.", - "FriendlyName": "Generalized Additive Model for Binary Classification", - "ShortName": "gam", + "Name": "Trainers.FastTreeTweedieRegressor", + "Desc": "Trains gradient boosted decision trees to fit target values using a Tweedie loss function. 
This learner is a generalization of Poisson, compound Poisson, and gamma regression.", + "FriendlyName": "FastTree (Boosted Trees) Tweedie Regression", + "ShortName": "fttweedie", "Inputs": [ { - "Name": "NumIterations", + "Name": "NumTrees", "Type": "Int", - "Desc": "Total number of iterations over all features", + "Desc": "Number of weak hypotheses in the ensemble", "Aliases": [ "iter" ], "Required": false, "SortOrder": 1.0, "IsNullable": false, - "Default": 9500, + "Default": 100, "SweepRange": { "RangeType": "Discrete", "Values": [ - 200, - 1500, - 9500 + 20, + 100, + 500 ] } }, @@ -9092,23 +8789,42 @@ "IsNullable": false }, { - "Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", + "Name": "NumLeaves", + "Type": "Int", + "Desc": "The max number of leaves in each regression tree", "Aliases": [ - "feat" + "nl" ], "Required": false, "SortOrder": 2.0, "IsNullable": false, - "Default": "Features" - }, - { - "Name": "MinDocuments", - "Type": "Int", - "Desc": "Minimum number of training instances required to form a partition", - "Aliases": [ - "mi" + "Default": 20, + "SweepRange": { + "RangeType": "Long", + "Min": 2, + "Max": 128, + "StepSize": 4.0, + "IsLogScale": true + } + }, + { + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", + "Aliases": [ + "feat" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Features" + }, + { + "Name": "MinDocumentsInLeafs", + "Type": "Int", + "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Aliases": [ + "mil" ], "Required": false, "SortOrder": 3.0, @@ -9145,11 +8861,11 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": 0.002, + "Default": 0.2, "SweepRange": { "RangeType": "Float", - "Min": 0.001, - "Max": 0.1, + "Min": 0.025, + "Max": 0.4, "IsLogScale": true } }, @@ -9165,6 +8881,18 @@ "IsNullable": false, "Default": "Weight" }, + { + "Name": 
"GroupIdColumn", + "Type": "String", + "Desc": "Column to use for example groupId", + "Aliases": [ + "groupId" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": "GroupId" + }, { "Name": "NormalizeFeatures", "Type": { @@ -9206,11 +8934,20 @@ "Default": "Auto" }, { - "Name": "UnbalancedSets", + "Name": "Index", + "Type": "Float", + "Desc": "Index parameter for the Tweedie distribution, in the range [1, 2]. 1 is Poisson loss, 2 is gamma loss, and intermediate values are compound Poisson loss.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.5 + }, + { + "Name": "BestStepRankingRegressionTrees", "Type": "Bool", - "Desc": "Should we use derivatives optimized for unbalanced sets", + "Desc": "Use best regression step trees?", "Aliases": [ - "us" + "bsr" ], "Required": false, "SortOrder": 150.0, @@ -9218,34 +8955,35 @@ "Default": false }, { - "Name": "Calibrator", - "Type": { - "Kind": "Component", - "ComponentKind": "CalibratorTrainer" - }, - "Desc": "The calibrator kind to apply to the predictor. 
Specify null for no calibration", + "Name": "UseLineSearch", + "Type": "Bool", + "Desc": "Should we use line search for a step size", + "Aliases": [ + "ls" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": { - "Name": "PlattCalibrator" - } + "Default": false }, { - "Name": "MaxCalibrationExamples", + "Name": "NumPostBracketSteps", "Type": "Int", - "Desc": "The maximum number of examples to use when training the calibrator", + "Desc": "Number of post-bracket line search steps", + "Aliases": [ + "lssteps" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1000000 + "Default": 0 }, { - "Name": "EntropyCoefficient", + "Name": "MinStepSize", "Type": "Float", - "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Desc": "Minimum line search step size", "Aliases": [ - "e" + "minstep" ], "Required": false, "SortOrder": 150.0, @@ -9253,293 +8991,251 @@ "Default": 0.0 }, { - "Name": "GainConfidenceLevel", - "Type": "Int", - "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", + "Name": "OptimizationAlgorithm", + "Type": { + "Kind": "Enum", + "Values": [ + "GradientDescent", + "AcceleratedGradientDescent", + "ConjugateGradientDescent" + ] + }, + "Desc": "Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent)", "Aliases": [ - "gainconf" + "oa" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": "GradientDescent" }, { - "Name": "NumThreads", - "Type": "Int", - "Desc": "The number of threads to use", + "Name": "EarlyStoppingRule", + "Type": { + "Kind": "Component", + "ComponentKind": "EarlyStoppingCriterion" + }, + "Desc": "Early stopping rule. 
(Validation set (/valid) is required.)", "Aliases": [ - "t" + "esr" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, + "IsNullable": false, "Default": null }, { - "Name": "DiskTranspose", - "Type": "Bool", - "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", + "Name": "EarlyStoppingMetrics", + "Type": "Int", + "Desc": "Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3)", "Aliases": [ - "dt" + "esmt" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 0 }, { - "Name": "MaxBins", - "Type": "Int", - "Desc": "Maximum number of distinct values (bins) per feature", + "Name": "EnablePruning", + "Type": "Bool", + "Desc": "Enable post-training pruning to avoid overfitting. (a validation set is required)", "Aliases": [ - "mb" + "pruning" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 255 + "Default": false }, { - "Name": "MaxOutput", - "Type": "Float", - "Desc": "Upper bound on absolute value of single output", + "Name": "UseTolerantPruning", + "Type": "Bool", + "Desc": "Use window and tolerance for pruning", "Aliases": [ - "mo" + "prtol" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "Infinity" + "Default": false }, { - "Name": "GetDerivativesSampleRate", - "Type": "Int", - "Desc": "Sample each query 1 in k times in the GetDerivatives function", + "Name": "PruningThreshold", + "Type": "Float", + "Desc": "The tolerance threshold for pruning", "Aliases": [ - "sr" + "prth" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": 0.004 }, { - "Name": "RngSeed", + "Name": "PruningWindowSize", "Type": "Int", - "Desc": "The seed of the random number generator", + "Desc": "The moving window size for pruning", "Aliases": [ - "r1" + "prws" ], "Required": false, "SortOrder": 150.0, "IsNullable": 
false, - "Default": 123 + "Default": 5 }, { - "Name": "FeatureFlocks", - "Type": "Bool", - "Desc": "Whether to collectivize features during dataset preparation to speed up training", + "Name": "Shrinkage", + "Type": "Float", + "Desc": "Shrinkage", "Aliases": [ - "flocks" + "shrk" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true - } - ], - "Outputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The trained model" - } - ], - "InputKind": [ - "ITrainerInputWithWeight", - "ITrainerInputWithLabel", - "ITrainerInput" - ], - "OutputKind": [ - "IBinaryClassificationOutput", - "ITrainerOutput" - ] - }, - { - "Name": "Trainers.GeneralizedAdditiveModelRegressor", - "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.", - "FriendlyName": "Generalized Additive Model for Regression", - "ShortName": "gamr", - "Inputs": [ + "Default": 1.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.025, + "Max": 4.0, + "IsLogScale": true + } + }, { - "Name": "NumIterations", - "Type": "Int", - "Desc": "Total number of iterations over all features", + "Name": "DropoutRate", + "Type": "Float", + "Desc": "Dropout rate for tree regularization", "Aliases": [ - "iter" + "tdrop" ], "Required": false, - "SortOrder": 1.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 9500, + "Default": 0.0, "SweepRange": { "RangeType": "Discrete", "Values": [ - 200, - 1500, - 9500 + 0.0, + 1E-09, + 0.05, + 0.1, + 0.2 ] } }, { - "Name": "TrainingData", - "Type": "DataView", - "Desc": "The data to be used for training", + "Name": "GetDerivativesSampleRate", + "Type": "Int", + "Desc": "Sample each query 1 in k times in the GetDerivatives function", "Aliases": [ - "data" + "sr" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1 }, { - 
"Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", + "Name": "WriteLastEnsemble", + "Type": "Bool", + "Desc": "Write the last ensemble instead of the one determined by early stopping", "Aliases": [ - "feat" + "hl" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Features" + "Default": false }, { - "Name": "MinDocuments", - "Type": "Int", - "Desc": "Minimum number of training instances required to form a partition", + "Name": "MaxTreeOutput", + "Type": "Float", + "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ - "mi" + "mo" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 10, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 1, - 10, - 50 - ] - } + "Default": 100.0 }, { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Column to use for labels", + "Name": "RandomStart", + "Type": "Bool", + "Desc": "Training starts from random ordering (determined by /r1)", "Aliases": [ - "lab" + "rs" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Label" + "Default": false }, { - "Name": "LearningRates", - "Type": "Float", - "Desc": "The learning rate", + "Name": "FilterZeroLambdas", + "Type": "Bool", + "Desc": "Filter zero lambdas during training", "Aliases": [ - "lr" + "fzl" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 0.002, - "SweepRange": { - "RangeType": "Float", - "Min": 0.001, - "Max": 0.1, - "IsLogScale": true - } + "Default": false }, { - "Name": "WeightColumn", + "Name": "BaselineScoresFormula", "Type": "String", - "Desc": "Column to use for example weight", - "Aliases": [ - "weight" - ], - "Required": false, - "SortOrder": 4.0, - "IsNullable": false, - "Default": "Weight" - }, - { - "Name": "NormalizeFeatures", - "Type": { - "Kind": "Enum", - "Values": [ - "No", - "Warn", - "Auto", - "Yes" - ] 
- }, - "Desc": "Normalize option for the feature column", + "Desc": "Freeform defining the scores that should be used as the baseline ranker", "Aliases": [ - "norm" + "basescores" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": null }, { - "Name": "Caching", - "Type": { - "Kind": "Enum", - "Values": [ - "Auto", - "Memory", - "Disk", - "None" - ] - }, - "Desc": "Whether learner should cache input training data", + "Name": "BaselineAlphaRisk", + "Type": "String", + "Desc": "Baseline alpha for tradeoffs of risk (0 is normal training)", "Aliases": [ - "cache" + "basealpha" ], "Required": false, - "SortOrder": 6.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": null }, { - "Name": "EntropyCoefficient", - "Type": "Float", - "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Name": "PositionDiscountFreeform", + "Type": "String", + "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ - "e" + "pdff" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": null }, { - "Name": "GainConfidenceLevel", - "Type": "Int", - "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", + "Name": "ParallelTrainer", + "Type": { + "Kind": "Component", + "ComponentKind": "ParallelTraining" + }, + "Desc": "Allows to choose Parallel FastTree Learning Algorithm", "Aliases": [ - "gainconf" + "parag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": { + "Name": "Single" + } }, { "Name": "NumThreads", @@ -9554,64 +9250,64 @@ "Default": null }, { - "Name": "DiskTranspose", - "Type": "Bool", - "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", + "Name": "RngSeed", + 
"Type": "Int", + "Desc": "The seed of the random number generator", "Aliases": [ - "dt" + "r1" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 123 }, { - "Name": "MaxBins", + "Name": "FeatureSelectSeed", "Type": "Int", - "Desc": "Maximum number of distinct values (bins) per feature", + "Desc": "The seed of the active feature selection", "Aliases": [ - "mb" + "r3" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 255 + "Default": 123 }, { - "Name": "MaxOutput", + "Name": "EntropyCoefficient", "Type": "Float", - "Desc": "Upper bound on absolute value of single output", + "Desc": "The entropy (regularization) coefficient between 0 and 1", "Aliases": [ - "mo" + "e" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "Infinity" + "Default": 0.0 }, { - "Name": "GetDerivativesSampleRate", + "Name": "HistogramPoolSize", "Type": "Int", - "Desc": "Sample each query 1 in k times in the GetDerivatives function", + "Desc": "The number of histograms in the pool (between 2 and numLeaves)", "Aliases": [ - "sr" + "ps" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": -1 }, { - "Name": "RngSeed", - "Type": "Int", - "Desc": "The seed of the random number generator", + "Name": "DiskTranspose", + "Type": "Bool", + "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", "Aliases": [ - "r1" + "dt" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": 123 + "IsNullable": true, + "Default": null }, { "Name": "FeatureFlocks", @@ -9624,459 +9320,326 @@ "SortOrder": 150.0, "IsNullable": false, "Default": true - } - ], - "Outputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The trained model" - } - ], - "InputKind": [ - "ITrainerInputWithWeight", - "ITrainerInputWithLabel", - "ITrainerInput" - ], - 
"OutputKind": [ - "IRegressionOutput", - "ITrainerOutput" - ] - }, - { - "Name": "Trainers.KMeansPlusPlusClusterer", - "Desc": "K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified number of clusters in order to minimize the within-cluster sum of squares. K-means++ improves upon K-means by using a better method for choosing the initial cluster centers.", - "FriendlyName": "KMeans++ Clustering", - "ShortName": "KM", - "Inputs": [ - { - "Name": "TrainingData", - "Type": "DataView", - "Desc": "The data to be used for training", - "Aliases": [ - "data" - ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false }, { - "Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", + "Name": "CategoricalSplit", + "Type": "Bool", + "Desc": "Whether to do split based on multiple categorical feature values.", "Aliases": [ - "feat" + "cat" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Features" + "Default": false }, { - "Name": "WeightColumn", - "Type": "String", - "Desc": "Column to use for example weight", + "Name": "MaxCategoricalGroupsPerNode", + "Type": "Int", + "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ - "weight" + "mcg" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Weight" + "Default": 64 }, { - "Name": "NormalizeFeatures", - "Type": { - "Kind": "Enum", - "Values": [ - "No", - "Warn", - "Auto", - "Yes" - ] - }, - "Desc": "Normalize option for the feature column", + "Name": "MaxCategoricalSplitPoints", + "Type": "Int", + "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ - "norm" + "maxcat" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": 64 }, { - "Name": "Caching", - "Type": { - "Kind": "Enum", - "Values": [ - "Auto", - "Memory", - "Disk", - "None" - ] - }, - "Desc": "Whether learner should cache input training data", + "Name": "MinDocsPercentageForCategoricalSplit", + "Type": "Float", + "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", "Aliases": [ - "cache" + "mdop" ], "Required": false, - "SortOrder": 6.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": 0.001 }, { - "Name": "K", + "Name": "MinDocsForCategoricalSplit", "Type": "Int", - "Desc": "The number of clusters", + "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Aliases": [ + "mdo" + ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 5, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 5, - 10, - 20, - 40 - ] - } + "Default": 100 }, { - "Name": "NumThreads", - "Type": "Int", - "Desc": "Degree of lock-free parallelism. Defaults to automatic. 
Determinism not guaranteed.", + "Name": "Bias", + "Type": "Float", + "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", "Aliases": [ - "nt", - "t", - "threads" + "bias" ], "Required": false, - "SortOrder": 50.0, - "IsNullable": true, - "Default": null + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 }, { - "Name": "InitAlgorithm", + "Name": "Bundling", "Type": { "Kind": "Enum", "Values": [ - "KMeansPlusPlus", - "Random", - "KMeansParallel" + "None", + "AggregateLowPopulation", + "Adjacent" ] }, - "Desc": "Cluster initialization algorithm", + "Desc": "Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", "Aliases": [ - "init" + "bundle" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "KMeansParallel" + "Default": "None" }, { - "Name": "OptTol", - "Type": "Float", - "Desc": "Tolerance parameter for trainer convergence. 
Lower = slower, more accurate", + "Name": "MaxBins", + "Type": "Int", + "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ - "ot" + "mb" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1E-07 + "Default": 255 }, { - "Name": "MaxIterations", - "Type": "Int", - "Desc": "Maximum number of iterations.", + "Name": "SparsifyThreshold", + "Type": "Float", + "Desc": "Sparsity level needed to use sparse feature representation", "Aliases": [ - "maxiter" + "sp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1000 + "Default": 0.7 }, { - "Name": "AccelMemBudgetMb", - "Type": "Int", - "Desc": "Memory budget (in MBs) to use for KMeans acceleration", + "Name": "FeatureFirstUsePenalty", + "Type": "Float", + "Desc": "The feature first use penalty coefficient", "Aliases": [ - "accelMemBudgetMb" + "ffup" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 4096 - } - ], - "Outputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The trained model" - } - ], - "InputKind": [ - "IUnsupervisedTrainerWithWeight", - "ITrainerInput" - ], - "OutputKind": [ - "IClusteringOutput", - "ITrainerOutput" - ] - }, - { - "Name": "Trainers.LinearSvmBinaryClassifier", - "Desc": "Train a linear SVM.", - "FriendlyName": "SVM (Pegasos-Linear)", - "ShortName": "svm", - "Inputs": [ - { - "Name": "TrainingData", - "Type": "DataView", - "Desc": "The data to be used for training", - "Aliases": [ - "data" - ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Default": 0.0 }, { - "Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", + "Name": "FeatureReusePenalty", + "Type": "Float", + "Desc": "The feature re-use penalty (regularization) coefficient", "Aliases": [ - "feat" + "frup" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Features" + "Default": 0.0 }, { - "Name": "LabelColumn", - 
"Type": "String", - "Desc": "Column to use for labels", + "Name": "GainConfidenceLevel", + "Type": "Float", + "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", "Aliases": [ - "lab" + "gainconf" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Label" + "Default": 0.0 }, { - "Name": "NormalizeFeatures", - "Type": { - "Kind": "Enum", - "Values": [ - "No", - "Warn", - "Auto", - "Yes" - ] - }, - "Desc": "Normalize option for the feature column", + "Name": "SoftmaxTemperature", + "Type": "Float", + "Desc": "The temperature of the randomized softmax distribution for choosing the feature", "Aliases": [ - "norm" + "smtemp" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": 0.0 }, { - "Name": "Caching", - "Type": { - "Kind": "Enum", - "Values": [ - "Auto", - "Memory", - "Disk", - "None" - ] - }, - "Desc": "Whether learner should cache input training data", + "Name": "ExecutionTimes", + "Type": "Bool", + "Desc": "Print execution time breakdown to stdout", "Aliases": [ - "cache" + "et" ], "Required": false, - "SortOrder": 6.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": false }, { - "Name": "Lambda", + "Name": "FeatureFraction", "Type": "Float", - "Desc": "Regularizer constant", + "Desc": "The fraction of features (chosen randomly) to use on each iteration", "Aliases": [ - "lambda" + "ff" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 0.001, - "SweepRange": { - "RangeType": "Float", - "Min": 1E-05, - "Max": 0.1, - "StepSize": 10.0, - "IsLogScale": true - } + "Default": 1.0 }, { - "Name": "PerformProjection", - "Type": "Bool", - "Desc": "Perform projection to unit-ball? 
Typically used with batch size > 1.", + "Name": "BaggingSize", + "Type": "Int", + "Desc": "Number of trees in each bag (0 for disabling bagging)", "Aliases": [ - "project" + "bag" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": false, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } + "Default": 0 }, { - "Name": "NumIterations", - "Type": "Int", - "Desc": "Number of iterations", + "Name": "BaggingTrainFraction", + "Type": "Float", + "Desc": "Percentage of training examples used in each bag", "Aliases": [ - "iter" + "bagfrac" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 1, - "SweepRange": { - "RangeType": "Long", - "Min": 1, - "Max": 100, - "StepSize": 10.0, - "IsLogScale": true - } + "Default": 0.7 }, { - "Name": "InitWtsDiameter", + "Name": "SplitFraction", "Type": "Float", - "Desc": "Init weights diameter", + "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ - "initwts" + "sf" ], "Required": false, - "SortOrder": 140.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 5 - } + "Default": 1.0 }, { - "Name": "NoBias", - "Type": "Bool", - "Desc": "No bias", + "Name": "Smoothing", + "Type": "Float", + "Desc": "Smoothing paramter for tree regularization", + "Aliases": [ + "s" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } + "Default": 0.0 }, { - "Name": "Calibrator", - "Type": { - "Kind": "Component", - "ComponentKind": "CalibratorTrainer" - }, - "Desc": "The calibrator kind to apply to the predictor. 
Specify null for no calibration", + "Name": "AllowEmptyTrees", + "Type": "Bool", + "Desc": "When a root split is impossible, allow training to proceed", + "Aliases": [ + "allowempty", + "dummies" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": { - "Name": "PlattCalibrator" - } + "Default": true }, { - "Name": "MaxCalibrationExamples", + "Name": "FeatureCompressionLevel", "Type": "Int", - "Desc": "The maximum number of examples to use when training the calibrator", + "Desc": "The level of feature compression to use", + "Aliases": [ + "fcomp" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1000000 + "Default": 1 }, { - "Name": "InitialWeights", - "Type": "String", - "Desc": "Initial Weights and bias, comma-separated", + "Name": "CompressEnsemble", + "Type": "Bool", + "Desc": "Compress the tree Ensemble", "Aliases": [ - "initweights" + "cmp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": false }, { - "Name": "Shuffle", + "Name": "MaxTreesAfterCompression", + "Type": "Int", + "Desc": "Maximum Number of trees after compression", + "Aliases": [ + "cmpmax" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": -1 + }, + { + "Name": "PrintTestGraph", "Type": "Bool", - "Desc": "Whether to shuffle for each training iteration", + "Desc": "Print metrics graph for the first test set", "Aliases": [ - "shuf" + "graph" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } + "Default": false }, { - "Name": "StreamingCacheSize", - "Type": "Int", - "Desc": "Size of cache when trained in Scope", + "Name": "PrintTrainValidGraph", + "Type": "Bool", + "Desc": "Print Train and Validation metrics in graph", "Aliases": [ - "cache" + "graphtv" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1000000 + "Default": false }, { 
- "Name": "BatchSize", + "Name": "TestFrequency", "Type": "Int", - "Desc": "Batch size", + "Desc": "Calculate metric values for train/valid/test every k rounds", "Aliases": [ - "batch" + "tf" ], "Required": false, - "SortOrder": 190.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": 2147483647 } ], "Outputs": [ @@ -10087,20 +9650,42 @@ } ], "InputKind": [ + "ITrainerInputWithGroupId", + "ITrainerInputWithWeight", "ITrainerInputWithLabel", "ITrainerInput" ], "OutputKind": [ - "IBinaryClassificationOutput", + "IRegressionOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.LogisticRegressionBinaryClassifier", - "Desc": "Train a logistic regression binary model", - "FriendlyName": "Logistic Regression", - "ShortName": "lr", + "Name": "Trainers.GeneralizedAdditiveModelBinaryClassifier", + "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.", + "FriendlyName": "Generalized Additive Model for Binary Classification", + "ShortName": "gam", "Inputs": [ + { + "Name": "NumIterations", + "Type": "Int", + "Desc": "Total number of iterations over all features", + "Aliases": [ + "iter" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 9500, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 200, + 1500, + 9500 + ] + } + }, { "Name": "TrainingData", "Type": "DataView", @@ -10124,6 +9709,26 @@ "IsNullable": false, "Default": "Features" }, + { + "Name": "MinDocuments", + "Type": "Int", + "Desc": "Minimum number of training instances required to form a partition", + "Aliases": [ + "mi" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": 10, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 10, + 50 + ] + } + }, { "Name": "LabelColumn", "Type": "String", @@ -10136,6 +9741,24 @@ "IsNullable": false, "Default": "Label" }, + { + "Name": "LearningRates", + 
"Type": "Float", + "Desc": "The learning rate", + "Aliases": [ + "lr" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": 0.002, + "SweepRange": { + "RangeType": "Float", + "Min": 0.001, + "Max": 0.1, + "IsLogScale": true + } + }, { "Name": "WeightColumn", "Type": "String", @@ -10189,205 +9812,147 @@ "Default": "Auto" }, { - "Name": "ShowTrainingStats", + "Name": "UnbalancedSets", "Type": "Bool", - "Desc": "Show statistics of training examples.", + "Desc": "Should we use derivatives optimized for unbalanced sets", "Aliases": [ - "stat" + "us" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, "Default": false }, { - "Name": "L2Weight", - "Type": "Float", - "Desc": "L2 regularization weight", - "Aliases": [ - "l2" - ], + "Name": "Calibrator", + "Type": { + "Kind": "Component", + "ComponentKind": "CalibratorTrainer" + }, + "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 4 + "Default": { + "Name": "PlattCalibrator" } }, { - "Name": "L1Weight", - "Type": "Float", - "Desc": "L1 regularization weight", - "Aliases": [ - "l1" - ], + "Name": "MaxCalibrationExamples", + "Type": "Int", + "Desc": "The maximum number of examples to use when training the calibrator", "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 4 - } + "Default": 1000000 }, { - "Name": "OptTol", + "Name": "EntropyCoefficient", "Type": "Float", - "Desc": "Tolerance parameter for optimization convergence. 
Lower = slower, more accurate", + "Desc": "The entropy (regularization) coefficient between 0 and 1", "Aliases": [ - "ot" + "e" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 1E-07, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.0001, - 1E-07 - ] - } + "Default": 0.0 }, { - "Name": "MemorySize", + "Name": "GainConfidenceLevel", "Type": "Int", - "Desc": "Memory size for L-BFGS. Lower=faster, less accurate", + "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", "Aliases": [ - "m" + "gainconf" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 20, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 5, - 20, - 50 - ] - } + "Default": 0 }, { - "Name": "EnforceNonNegativity", - "Type": "Bool", - "Desc": "Enforce non-negative weights", + "Name": "NumThreads", + "Type": "Int", + "Desc": "The number of threads to use", "Aliases": [ - "nn" + "t" ], "Required": false, - "SortOrder": 90.0, - "IsNullable": false, - "Default": false + "SortOrder": 150.0, + "IsNullable": true, + "Default": null }, { - "Name": "InitWtsDiameter", - "Type": "Float", - "Desc": "Init weights diameter", + "Name": "DiskTranspose", + "Type": "Bool", + "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", "Aliases": [ - "initwts" + "dt" ], "Required": false, - "SortOrder": 140.0, - "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 5 - } + "SortOrder": 150.0, + "IsNullable": true, + "Default": null }, { - "Name": "MaxIterations", + "Name": "MaxBins", "Type": "Int", - "Desc": "Maximum iterations.", + "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ - "maxiter" + "mb" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 2147483647, - "SweepRange": { - 
"RangeType": "Long", - "Min": 1, - "Max": 2147483647 - } + "Default": 255 }, { - "Name": "SgdInitializationTolerance", + "Name": "MaxOutput", "Type": "Float", - "Desc": "Run SGD to initialize LR weights, converging to this tolerance", - "Aliases": [ - "sgd" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "Quiet", - "Type": "Bool", - "Desc": "If set to true, produce no output during training.", + "Desc": "Upper bound on absolute value of single output", "Aliases": [ - "q" + "mo" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": "Infinity" }, { - "Name": "UseThreads", - "Type": "Bool", - "Desc": "Whether or not to use threads. Default is true", + "Name": "GetDerivativesSampleRate", + "Type": "Int", + "Desc": "Sample each query 1 in k times in the GetDerivatives function", "Aliases": [ - "t" + "sr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": 1 }, { - "Name": "NumThreads", + "Name": "RngSeed", "Type": "Int", - "Desc": "Number of threads", + "Desc": "The seed of the random number generator", "Aliases": [ - "nt" + "r1" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 123 }, { - "Name": "DenseOptimizer", + "Name": "FeatureFlocks", "Type": "Bool", - "Desc": "Force densification of the internal optimization vectors", + "Desc": "Whether to collectivize features during dataset preparation to speed up training", "Aliases": [ - "do" + "flocks" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } + "Default": true } ], "Outputs": [ @@ -10408,11 +9973,31 @@ ] }, { - "Name": "Trainers.LogisticRegressionClassifier", - "Desc": "Train a logistic regression multi class model", - "FriendlyName": "Multi-class Logistic Regression", - "ShortName": "mlr", + 
"Name": "Trainers.GeneralizedAdditiveModelRegressor", + "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.", + "FriendlyName": "Generalized Additive Model for Regression", + "ShortName": "gamr", "Inputs": [ + { + "Name": "NumIterations", + "Type": "Int", + "Desc": "Total number of iterations over all features", + "Aliases": [ + "iter" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 9500, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 200, + 1500, + 9500 + ] + } + }, { "Name": "TrainingData", "Type": "DataView", @@ -10436,6 +10021,26 @@ "IsNullable": false, "Default": "Features" }, + { + "Name": "MinDocuments", + "Type": "Int", + "Desc": "Minimum number of training instances required to form a partition", + "Aliases": [ + "mi" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": 10, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 10, + 50 + ] + } + }, { "Name": "LabelColumn", "Type": "String", @@ -10448,6 +10053,24 @@ "IsNullable": false, "Default": "Label" }, + { + "Name": "LearningRates", + "Type": "Float", + "Desc": "The learning rate", + "Aliases": [ + "lr" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": 0.002, + "SweepRange": { + "RangeType": "Float", + "Min": 0.001, + "Max": 0.1, + "IsLogScale": true + } + }, { "Name": "WeightColumn", "Type": "String", @@ -10501,205 +10124,112 @@ "Default": "Auto" }, { - "Name": "ShowTrainingStats", - "Type": "Bool", - "Desc": "Show statistics of training examples.", + "Name": "EntropyCoefficient", + "Type": "Float", + "Desc": "The entropy (regularization) coefficient between 0 and 1", "Aliases": [ - "stat" + "e" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.0 }, { - "Name": "L2Weight", - "Type": "Float", - 
"Desc": "L2 regularization weight", + "Name": "GainConfidenceLevel", + "Type": "Int", + "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", "Aliases": [ - "l2" + "gainconf" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 4 - } + "Default": 0 }, { - "Name": "L1Weight", - "Type": "Float", - "Desc": "L1 regularization weight", + "Name": "NumThreads", + "Type": "Int", + "Desc": "The number of threads to use", "Aliases": [ - "l1" + "t" ], "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 4 - } + "SortOrder": 150.0, + "IsNullable": true, + "Default": null }, { - "Name": "OptTol", - "Type": "Float", - "Desc": "Tolerance parameter for optimization convergence. Lower = slower, more accurate", + "Name": "DiskTranspose", + "Type": "Bool", + "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", "Aliases": [ - "ot" + "dt" ], "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 1E-07, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.0001, - 1E-07 - ] - } + "SortOrder": 150.0, + "IsNullable": true, + "Default": null }, { - "Name": "MemorySize", + "Name": "MaxBins", "Type": "Int", - "Desc": "Memory size for L-BFGS. 
Lower=faster, less accurate", + "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ - "m" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 20, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 5, - 20, - 50 - ] - } - }, - { - "Name": "EnforceNonNegativity", - "Type": "Bool", - "Desc": "Enforce non-negative weights", - "Aliases": [ - "nn" - ], - "Required": false, - "SortOrder": 90.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "InitWtsDiameter", - "Type": "Float", - "Desc": "Init weights diameter", - "Aliases": [ - "initwts" - ], - "Required": false, - "SortOrder": 140.0, - "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 5 - } - }, - { - "Name": "MaxIterations", - "Type": "Int", - "Desc": "Maximum iterations.", - "Aliases": [ - "maxiter" + "mb" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 2147483647, - "SweepRange": { - "RangeType": "Long", - "Min": 1, - "Max": 2147483647 - } + "Default": 255 }, { - "Name": "SgdInitializationTolerance", + "Name": "MaxOutput", "Type": "Float", - "Desc": "Run SGD to initialize LR weights, converging to this tolerance", - "Aliases": [ - "sgd" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "Quiet", - "Type": "Bool", - "Desc": "If set to true, produce no output during training.", + "Desc": "Upper bound on absolute value of single output", "Aliases": [ - "q" + "mo" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": "Infinity" }, { - "Name": "UseThreads", - "Type": "Bool", - "Desc": "Whether or not to use threads. 
Default is true", + "Name": "GetDerivativesSampleRate", + "Type": "Int", + "Desc": "Sample each query 1 in k times in the GetDerivatives function", "Aliases": [ - "t" + "sr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": 1 }, { - "Name": "NumThreads", + "Name": "RngSeed", "Type": "Int", - "Desc": "Number of threads", + "Desc": "The seed of the random number generator", "Aliases": [ - "nt" + "r1" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 123 }, { - "Name": "DenseOptimizer", + "Name": "FeatureFlocks", "Type": "Bool", - "Desc": "Force densification of the internal optimization vectors", + "Desc": "Whether to collectivize features during dataset preparation to speed up training", "Aliases": [ - "do" + "flocks" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } + "Default": true } ], "Outputs": [ @@ -10715,15 +10245,15 @@ "ITrainerInput" ], "OutputKind": [ - "IMulticlassClassificationOutput", + "IRegressionOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.NaiveBayesClassifier", - "Desc": "Train a MultiClassNaiveBayesTrainer.", - "FriendlyName": "Multiclass Naive Bayes", - "ShortName": "MNB", + "Name": "Trainers.KMeansPlusPlusClusterer", + "Desc": "K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified number of clusters in order to minimize the within-cluster sum of squares. 
K-means++ improves upon K-means by using a better method for choosing the initial cluster centers.", + "FriendlyName": "KMeans++ Clustering", + "ShortName": "KM", "Inputs": [ { "Name": "TrainingData", @@ -10749,16 +10279,16 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "WeightColumn", "Type": "String", - "Desc": "Column to use for labels", + "Desc": "Column to use for example weight", "Aliases": [ - "lab" + "weight" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": "Label" + "Default": "Weight" }, { "Name": "NormalizeFeatures", @@ -10799,6 +10329,93 @@ "SortOrder": 6.0, "IsNullable": false, "Default": "Auto" + }, + { + "Name": "K", + "Type": "Int", + "Desc": "The number of clusters", + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 5, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 5, + 10, + 20, + 40 + ] + } + }, + { + "Name": "NumThreads", + "Type": "Int", + "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.", + "Aliases": [ + "nt", + "t", + "threads" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "InitAlgorithm", + "Type": { + "Kind": "Enum", + "Values": [ + "KMeansPlusPlus", + "Random", + "KMeansParallel" + ] + }, + "Desc": "Cluster initialization algorithm", + "Aliases": [ + "init" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "KMeansParallel" + }, + { + "Name": "OptTol", + "Type": "Float", + "Desc": "Tolerance parameter for trainer convergence. 
Lower = slower, more accurate", + "Aliases": [ + "ot" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1E-07 + }, + { + "Name": "MaxIterations", + "Type": "Int", + "Desc": "Maximum number of iterations.", + "Aliases": [ + "maxiter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000 + }, + { + "Name": "AccelMemBudgetMb", + "Type": "Int", + "Desc": "Memory budget (in MBs) to use for KMeans acceleration", + "Aliases": [ + "accelMemBudgetMb" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 4096 } ], "Outputs": [ @@ -10809,19 +10426,19 @@ } ], "InputKind": [ - "ITrainerInputWithLabel", + "IUnsupervisedTrainerWithWeight", "ITrainerInput" ], "OutputKind": [ - "IMulticlassClassificationOutput", + "IClusteringOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.OnlineGradientDescentRegressor", - "Desc": "Train a Online gradient descent perceptron.", - "FriendlyName": "Stochastic Gradient Descent (Regression)", - "ShortName": "ogd", + "Name": "Trainers.LinearSvmBinaryClassifier", + "Desc": "Train a linear SVM.", + "FriendlyName": "SVM (Pegasos-Linear)", + "ShortName": "svm", "Inputs": [ { "Name": "TrainingData", @@ -10899,54 +10516,35 @@ "Default": "Auto" }, { - "Name": "LossFunction", - "Type": { - "Kind": "Component", - "ComponentKind": "RegressionLossFunction" - }, - "Desc": "Loss Function", + "Name": "Lambda", + "Type": "Float", + "Desc": "Regularizer constant", "Aliases": [ - "loss" + "lambda" ], "Required": false, "SortOrder": 50.0, "IsNullable": false, - "Default": { - "Name": "SquaredLoss" + "Default": 0.001, + "SweepRange": { + "RangeType": "Float", + "Min": 1E-05, + "Max": 0.1, + "StepSize": 10.0, + "IsLogScale": true } }, { - "Name": "LearningRate", - "Type": "Float", - "Desc": "Learning rate", - "Aliases": [ - "lr" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 0.1, - "SweepRange": { - "RangeType": "Discrete", - "Values": 
[ - 0.01, - 0.1, - 0.5, - 1.0 - ] - } - }, - { - "Name": "DecreaseLearningRate", + "Name": "PerformProjection", "Type": "Bool", - "Desc": "Decrease learning rate", + "Desc": "Perform projection to unit-ball? Typically used with batch size > 1.", "Aliases": [ - "decreaselr" + "project" ], "Required": false, "SortOrder": 50.0, "IsNullable": false, - "Default": true, + "Default": false, "SweepRange": { "RangeType": "Discrete", "Values": [ @@ -10955,23 +10553,6 @@ ] } }, - { - "Name": "L2RegularizerWeight", - "Type": "Float", - "Desc": "L2 Regularization Weight", - "Aliases": [ - "reg" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 0.5 - } - }, { "Name": "NumIterations", "Type": "Int", @@ -11010,76 +10591,43 @@ } }, { - "Name": "ResetWeightsAfterXExamples", - "Type": "Int", - "Desc": "Number of examples after which weights will be reset to the current average", - "Aliases": [ - "numreset" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "DoLazyUpdates", - "Type": "Bool", - "Desc": "Instead of updating averaged weights on every example, only update when loss is nonzero", - "Aliases": [ - "lazy" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "RecencyGain", - "Type": "Float", - "Desc": "Extra weight given to more recent updates", - "Aliases": [ - "rg" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "RecencyGainMulti", + "Name": "NoBias", "Type": "Bool", - "Desc": "Whether Recency Gain is multiplicative (vs. 
additive)", - "Aliases": [ - "rgm" - ], + "Desc": "No bias", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": false, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } }, { - "Name": "Averaged", - "Type": "Bool", - "Desc": "Do averaging?", - "Aliases": [ - "avg" - ], + "Name": "Calibrator", + "Type": { + "Kind": "Component", + "ComponentKind": "CalibratorTrainer" + }, + "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": { + "Name": "PlattCalibrator" + } }, { - "Name": "AveragedTolerance", - "Type": "Float", - "Desc": "The inexactness tolerance for averaging", - "Aliases": [ - "avgtol" - ], + "Name": "MaxCalibrationExamples", + "Type": "Int", + "Desc": "The maximum number of examples to use when training the calibrator", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.01 + "Default": 1000000 }, { "Name": "InitialWeights", @@ -11123,6 +10671,18 @@ "SortOrder": 150.0, "IsNullable": false, "Default": 1000000 + }, + { + "Name": "BatchSize", + "Type": "Int", + "Desc": "Batch size", + "Aliases": [ + "batch" + ], + "Required": false, + "SortOrder": 190.0, + "IsNullable": false, + "Default": 1 } ], "Outputs": [ @@ -11137,15 +10697,15 @@ "ITrainerInput" ], "OutputKind": [ - "IRegressionOutput", + "IBinaryClassificationOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.PcaAnomalyDetector", - "Desc": "Train an PCA Anomaly model.", - "FriendlyName": "PCA Anomaly Detector", - "ShortName": "pcaAnom", + "Name": "Trainers.LogisticRegressionBinaryClassifier", + "Desc": "Train a logistic regression binary model", + "FriendlyName": "Logistic Regression", + "ShortName": "lr", "Inputs": [ { "Name": "TrainingData", @@ -11170,6 +10730,18 @@ "IsNullable": false, "Default": "Features" }, + { + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use 
for labels", + "Aliases": [ + "lab" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Label" + }, { "Name": "WeightColumn", "Type": "String", @@ -11223,30 +10795,79 @@ "Default": "Auto" }, { - "Name": "Rank", - "Type": "Int", - "Desc": "The number of components in the PCA", + "Name": "ShowTrainingStats", + "Type": "Bool", + "Desc": "Show statistics of training examples.", "Aliases": [ - "k" + "stat" ], "Required": false, "SortOrder": 50.0, "IsNullable": false, - "Default": 20, + "Default": false + }, + { + "Name": "L2Weight", + "Type": "Float", + "Desc": "L2 regularization weight", + "Aliases": [ + "l2" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 1.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.0, + "Max": 1.0, + "NumSteps": 4 + } + }, + { + "Name": "L1Weight", + "Type": "Float", + "Desc": "L1 regularization weight", + "Aliases": [ + "l1" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 1.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.0, + "Max": 1.0, + "NumSteps": 4 + } + }, + { + "Name": "OptTol", + "Type": "Float", + "Desc": "Tolerance parameter for optimization convergence. Lower = slower, more accurate", + "Aliases": [ + "ot" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 1E-07, "SweepRange": { "RangeType": "Discrete", "Values": [ - 10, - 20, - 40, - 80 + 0.0001, + 1E-07 ] } }, { - "Name": "Oversampling", + "Name": "MemorySize", "Type": "Int", - "Desc": "Oversampling parameter for randomized PCA training", + "Desc": "Memory size for L-BFGS. 
Lower=faster, less accurate", + "Aliases": [ + "m" + ], "Required": false, "SortOrder": 50.0, "IsNullable": false, @@ -11254,42 +10875,125 @@ "SweepRange": { "RangeType": "Discrete", "Values": [ - 10, + 5, 20, - 40 + 50 ] } }, { - "Name": "Center", + "Name": "EnforceNonNegativity", "Type": "Bool", - "Desc": "If enabled, data is centered to be zero mean", + "Desc": "Enforce non-negative weights", "Aliases": [ - "center" + "nn" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 90.0, "IsNullable": false, - "Default": true, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } + "Default": false }, { - "Name": "Seed", - "Type": "Int", - "Desc": "The seed for random number generation", + "Name": "InitWtsDiameter", + "Type": "Float", + "Desc": "Init weights diameter", "Aliases": [ - "seed" + "initwts" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 140.0, + "IsNullable": false, + "Default": 0.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.0, + "Max": 1.0, + "NumSteps": 5 + } + }, + { + "Name": "MaxIterations", + "Type": "Int", + "Desc": "Maximum iterations.", + "Aliases": [ + "maxiter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 2147483647, + "SweepRange": { + "RangeType": "Long", + "Min": 1, + "Max": 2147483647 + } + }, + { + "Name": "SgdInitializationTolerance", + "Type": "Float", + "Desc": "Run SGD to initialize LR weights, converging to this tolerance", + "Aliases": [ + "sgd" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 + }, + { + "Name": "Quiet", + "Type": "Bool", + "Desc": "If set to true, produce no output during training.", + "Aliases": [ + "q" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "UseThreads", + "Type": "Bool", + "Desc": "Whether or not to use threads. 
Default is true", + "Aliases": [ + "t" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "NumThreads", + "Type": "Int", + "Desc": "Number of threads", + "Aliases": [ + "nt" + ], + "Required": false, + "SortOrder": 150.0, "IsNullable": true, "Default": null + }, + { + "Name": "DenseOptimizer", + "Type": "Bool", + "Desc": "Force densification of the internal optimization vectors", + "Aliases": [ + "do" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } } ], "Outputs": [ @@ -11300,19 +11004,20 @@ } ], "InputKind": [ - "IUnsupervisedTrainerWithWeight", + "ITrainerInputWithWeight", + "ITrainerInputWithLabel", "ITrainerInput" ], "OutputKind": [ - "IAnomalyDetectionOutput", + "IBinaryClassificationOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.PoissonRegressor", - "Desc": "Train an Poisson regression model.", - "FriendlyName": "Poisson Regression", - "ShortName": "PR", + "Name": "Trainers.LogisticRegressionClassifier", + "Desc": "Train a logistic regression multi class model", + "FriendlyName": "Multi-class Logistic Regression", + "ShortName": "mlr", "Inputs": [ { "Name": "TrainingData", @@ -11401,6 +11106,18 @@ "IsNullable": false, "Default": "Auto" }, + { + "Name": "ShowTrainingStats", + "Type": "Bool", + "Desc": "Show statistics of training examples.", + "Aliases": [ + "stat" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": false + }, { "Name": "L2Weight", "Type": "Float", @@ -11604,40 +11321,16 @@ "ITrainerInput" ], "OutputKind": [ - "IRegressionOutput", + "IMulticlassClassificationOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.StochasticDualCoordinateAscentBinaryClassifier", - "Desc": "Train an SDCA binary model.", - "FriendlyName": "Fast Linear (SA-SDCA)", - "ShortName": "SDCA", + "Name": "Trainers.NaiveBayesClassifier", + "Desc": "Train a 
MultiClassNaiveBayesTrainer.", + "FriendlyName": "Multiclass Naive Bayes", + "ShortName": "MNB", "Inputs": [ - { - "Name": "L2Const", - "Type": "Float", - "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.", - "Aliases": [ - "l2" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", - 1E-07, - 1E-06, - 1E-05, - 0.0001, - 0.001, - 0.01 - ] - } - }, { "Name": "TrainingData", "Type": "DataView", @@ -11649,29 +11342,6 @@ "SortOrder": 1.0, "IsNullable": false }, - { - "Name": "L1Threshold", - "Type": "Float", - "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set.", - "Aliases": [ - "l1" - ], - "Required": false, - "SortOrder": 2.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", - 0.0, - 0.25, - 0.5, - 0.75, - 1.0 - ] - } - }, { "Name": "FeatureColumn", "Type": "String", @@ -11735,389 +11405,304 @@ "SortOrder": 6.0, "IsNullable": false, "Default": "Auto" - }, + } + ], + "Outputs": [ { - "Name": "LossFunction", - "Type": { - "Kind": "Component", - "ComponentKind": "SDCAClassificationLossFunction" - }, - "Desc": "Loss Function", + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IMulticlassClassificationOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.OnlineGradientDescentRegressor", + "Desc": "Train a Online gradient descent perceptron.", + "FriendlyName": "Stochastic Gradient Descent (Regression)", + "ShortName": "ogd", + "Inputs": [ + { + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", "Aliases": [ - "loss" + 
"data" ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": { - "Name": "LogLoss" - } + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "NumThreads", - "Type": "Int", - "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", "Aliases": [ - "nt", - "t", - "threads" + "feat" ], "Required": false, - "SortOrder": 50.0, - "IsNullable": true, - "Default": null + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Features" }, { - "Name": "PositiveInstanceWeight", - "Type": "Float", - "Desc": "Apply weight to the positive class, for imbalanced data", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", "Aliases": [ - "piw" + "lab" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 1.0 + "Default": "Label" }, { - "Name": "Calibrator", + "Name": "NormalizeFeatures", "Type": { - "Kind": "Component", - "ComponentKind": "CalibratorTrainer" - }, - "Desc": "The calibrator kind to apply to the predictor. 
Specify null for no calibration", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": { - "Name": "PlattCalibrator" - } + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": "Auto" }, { - "Name": "MaxCalibrationExamples", - "Type": "Int", - "Desc": "The maximum number of examples to use when training the calibrator", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", + "Aliases": [ + "cache" + ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": 1000000 + "Default": "Auto" }, { - "Name": "ConvergenceTolerance", + "Name": "LossFunction", + "Type": { + "Kind": "Component", + "ComponentKind": "RegressionLossFunction" + }, + "Desc": "Loss Function", + "Aliases": [ + "loss" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": { + "Name": "SquaredLoss" + } + }, + { + "Name": "LearningRate", "Type": "Float", - "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.", + "Desc": "Learning rate", "Aliases": [ - "tol" + "lr" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, "Default": 0.1, "SweepRange": { "RangeType": "Discrete", "Values": [ - 0.001, 0.01, 0.1, - 0.2 + 0.5, + 1.0 ] } }, { - "Name": "MaxIterations", - "Type": "Int", - "Desc": "Maximum number of iterations; set to 1 to simulate online learning. 
Defaults to automatic.", + "Name": "DecreaseLearningRate", + "Type": "Bool", + "Desc": "Decrease learning rate", "Aliases": [ - "iter" + "decreaselr" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null, + "SortOrder": 50.0, + "IsNullable": false, + "Default": true, "SweepRange": { "RangeType": "Discrete", "Values": [ - "", - 10, - 20, - 100 + false, + true ] } }, { - "Name": "Shuffle", - "Type": "Bool", - "Desc": "Shuffle data every epoch?", + "Name": "L2RegularizerWeight", + "Type": "Float", + "Desc": "L2 Regularization Weight", "Aliases": [ - "shuf" + "reg" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": true, + "Default": 0.0, "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] + "RangeType": "Float", + "Min": 0.0, + "Max": 0.5 } }, { - "Name": "CheckFrequency", + "Name": "NumIterations", "Type": "Int", - "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. 
If left blank, it defaults to check after every 'numThreads' iterations.", + "Desc": "Number of iterations", "Aliases": [ - "checkFreq" + "iter" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "SortOrder": 50.0, + "IsNullable": false, + "Default": 1, + "SweepRange": { + "RangeType": "Long", + "Min": 1, + "Max": 100, + "StepSize": 10.0, + "IsLogScale": true + } }, { - "Name": "BiasLearningRate", + "Name": "InitWtsDiameter", "Type": "Float", - "Desc": "The learning rate for adjusting bias from being regularized.", + "Desc": "Init weights diameter", "Aliases": [ - "blr" + "initwts" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 140.0, "IsNullable": false, "Default": 0.0, "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.0, - 0.01, - 0.1, - 1.0 - ] + "RangeType": "Float", + "Min": 0.0, + "Max": 1.0, + "NumSteps": 5 } - } - ], - "Outputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The trained model" - } - ], - "InputKind": [ - "ITrainerInputWithLabel", - "ITrainerInput" - ], - "OutputKind": [ - "IBinaryClassificationOutput", - "ITrainerOutput" - ] - }, - { - "Name": "Trainers.StochasticDualCoordinateAscentClassifier", - "Desc": "Train an SDCA multi class model", - "FriendlyName": "Fast Linear Multi-class Classification (SA-SDCA)", - "ShortName": "sasdcamc", - "Inputs": [ + }, { - "Name": "L2Const", - "Type": "Float", - "Desc": "L2 regularizer constant. 
By default the l2 constant is automatically inferred based on data set.", + "Name": "ResetWeightsAfterXExamples", + "Type": "Int", + "Desc": "Number of examples after which weights will be reset to the current average", "Aliases": [ - "l2" + "numreset" ], "Required": false, - "SortOrder": 1.0, + "SortOrder": 150.0, "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", - 1E-07, - 1E-06, - 1E-05, - 0.0001, - 0.001, - 0.01 - ] - } + "Default": null }, { - "Name": "TrainingData", - "Type": "DataView", - "Desc": "The data to be used for training", + "Name": "DoLazyUpdates", + "Type": "Bool", + "Desc": "Instead of updating averaged weights on every example, only update when loss is nonzero", "Aliases": [ - "data" + "lazy" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true }, { - "Name": "L1Threshold", + "Name": "RecencyGain", "Type": "Float", - "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set.", + "Desc": "Extra weight given to more recent updates", "Aliases": [ - "l1" + "rg" ], "Required": false, - "SortOrder": 2.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", - 0.0, - 0.25, - 0.5, - 0.75, - 1.0 - ] - } + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 }, { - "Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", + "Name": "RecencyGainMulti", + "Type": "Bool", + "Desc": "Whether Recency Gain is multiplicative (vs. 
additive)", "Aliases": [ - "feat" + "rgm" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Features" + "Default": false }, { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Column to use for labels", + "Name": "Averaged", + "Type": "Bool", + "Desc": "Do averaging?", "Aliases": [ - "lab" + "avg" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Label" - }, - { - "Name": "NormalizeFeatures", - "Type": { - "Kind": "Enum", - "Values": [ - "No", - "Warn", - "Auto", - "Yes" - ] - }, - "Desc": "Normalize option for the feature column", - "Aliases": [ - "norm" - ], - "Required": false, - "SortOrder": 5.0, - "IsNullable": false, - "Default": "Auto" - }, - { - "Name": "Caching", - "Type": { - "Kind": "Enum", - "Values": [ - "Auto", - "Memory", - "Disk", - "None" - ] - }, - "Desc": "Whether learner should cache input training data", - "Aliases": [ - "cache" - ], - "Required": false, - "SortOrder": 6.0, - "IsNullable": false, - "Default": "Auto" - }, - { - "Name": "LossFunction", - "Type": { - "Kind": "Component", - "ComponentKind": "SDCAClassificationLossFunction" - }, - "Desc": "Loss Function", - "Aliases": [ - "loss" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": { - "Name": "LogLoss" - } - }, - { - "Name": "NumThreads", - "Type": "Int", - "Desc": "Degree of lock-free parallelism. Defaults to automatic. 
Determinism not guaranteed.", - "Aliases": [ - "nt", - "t", - "threads" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": true, - "Default": null + "Default": true }, { - "Name": "ConvergenceTolerance", + "Name": "AveragedTolerance", "Type": "Float", - "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.", + "Desc": "The inexactness tolerance for averaging", "Aliases": [ - "tol" + "avgtol" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.1, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.001, - 0.01, - 0.1, - 0.2 - ] - } + "Default": 0.01 }, { - "Name": "MaxIterations", - "Type": "Int", - "Desc": "Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic.", + "Name": "InitialWeights", + "Type": "String", + "Desc": "Initial Weights and bias, comma-separated", "Aliases": [ - "iter" + "initweights" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", - 10, - 20, - 100 - ] - } + "IsNullable": false, + "Default": null }, { "Name": "Shuffle", "Type": "Bool", - "Desc": "Shuffle data every epoch?", + "Desc": "Whether to shuffle for each training iteration", "Aliases": [ "shuf" ], @@ -12134,37 +11719,16 @@ } }, { - "Name": "CheckFrequency", + "Name": "StreamingCacheSize", "Type": "Int", - "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. 
If left blank, it defaults to check after every 'numThreads' iterations.", - "Aliases": [ - "checkFreq" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "BiasLearningRate", - "Type": "Float", - "Desc": "The learning rate for adjusting bias from being regularized.", + "Desc": "Size of cache when trained in Scope", "Aliases": [ - "blr" + "cache" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.0, - 0.01, - 0.1, - 1.0 - ] - } + "Default": 1000000 } ], "Outputs": [ @@ -12179,40 +11743,16 @@ "ITrainerInput" ], "OutputKind": [ - "IMulticlassClassificationOutput", + "IRegressionOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.StochasticDualCoordinateAscentRegressor", - "Desc": "Train an SDCA regression model", - "FriendlyName": "Fast Linear Regression (SA-SDCA)", - "ShortName": "sasdcar", + "Name": "Trainers.PcaAnomalyDetector", + "Desc": "Train an PCA Anomaly model.", + "FriendlyName": "PCA Anomaly Detector", + "ShortName": "pcaAnom", "Inputs": [ - { - "Name": "L2Const", - "Type": "Float", - "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.", - "Aliases": [ - "l2" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", - 1E-07, - 1E-06, - 1E-05, - 0.0001, - 0.001, - 0.01 - ] - } - }, { "Name": "TrainingData", "Type": "DataView", @@ -12224,29 +11764,6 @@ "SortOrder": 1.0, "IsNullable": false }, - { - "Name": "L1Threshold", - "Type": "Float", - "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. 
By default the l1 threshold is automatically inferred based on data set.", - "Aliases": [ - "l1" - ], - "Required": false, - "SortOrder": 2.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", - 0.0, - 0.25, - 0.5, - 0.75, - 1.0 - ] - } - }, { "Name": "FeatureColumn", "Type": "String", @@ -12260,16 +11777,16 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "WeightColumn", "Type": "String", - "Desc": "Column to use for labels", + "Desc": "Column to use for example weight", "Aliases": [ - "lab" + "weight" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": "Label" + "Default": "Weight" }, { "Name": "NormalizeFeatures", @@ -12312,84 +11829,49 @@ "Default": "Auto" }, { - "Name": "LossFunction", - "Type": { - "Kind": "Component", - "ComponentKind": "SDCARegressionLossFunction" - }, - "Desc": "Loss Function", + "Name": "Rank", + "Type": "Int", + "Desc": "The number of components in the PCA", "Aliases": [ - "loss" + "k" ], "Required": false, "SortOrder": 50.0, "IsNullable": false, - "Default": { - "Name": "SquaredLoss" + "Default": 20, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 10, + 20, + 40, + 80 + ] } }, { - "Name": "NumThreads", + "Name": "Oversampling", "Type": "Int", - "Desc": "Degree of lock-free parallelism. Defaults to automatic. 
Determinism not guaranteed.", - "Aliases": [ - "nt", - "t", - "threads" - ], + "Desc": "Oversampling parameter for randomized PCA training", "Required": false, "SortOrder": 50.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "ConvergenceTolerance", - "Type": "Float", - "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.", - "Aliases": [ - "tol" - ], - "Required": false, - "SortOrder": 150.0, "IsNullable": false, - "Default": 0.01, + "Default": 20, "SweepRange": { "RangeType": "Discrete", "Values": [ - 0.001, - 0.01, - 0.1, - 0.2 - ] - } - }, - { - "Name": "MaxIterations", - "Type": "Int", - "Desc": "Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic.", - "Aliases": [ - "iter" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", 10, 20, - 100 + 40 ] } }, { - "Name": "Shuffle", + "Name": "Center", "Type": "Bool", - "Desc": "Shuffle data every epoch?", + "Desc": "If enabled, data is centered to be zero mean", "Aliases": [ - "shuf" + "center" ], "Required": false, "SortOrder": 150.0, @@ -12404,37 +11886,16 @@ } }, { - "Name": "CheckFrequency", + "Name": "Seed", "Type": "Int", - "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. 
If left blank, it defaults to check after every 'numThreads' iterations.", + "Desc": "The seed for random number generation", "Aliases": [ - "checkFreq" + "seed" ], "Required": false, "SortOrder": 150.0, "IsNullable": true, "Default": null - }, - { - "Name": "BiasLearningRate", - "Type": "Float", - "Desc": "The learning rate for adjusting bias from being regularized.", - "Aliases": [ - "blr" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.0, - 0.01, - 0.1, - 1.0 - ] - } } ], "Outputs": [ @@ -12445,19 +11906,19 @@ } ], "InputKind": [ - "ITrainerInputWithLabel", + "IUnsupervisedTrainerWithWeight", "ITrainerInput" ], "OutputKind": [ - "IRegressionOutput", + "IAnomalyDetectionOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.StochasticGradientDescentBinaryClassifier", - "Desc": "Train an Hogwild SGD binary model.", - "FriendlyName": "Hogwild SGD (binary)", - "ShortName": "HogwildSGD", + "Name": "Trainers.PoissonRegressor", + "Desc": "Train an Poisson regression model.", + "FriendlyName": "Poisson Regression", + "ShortName": "PR", "Inputs": [ { "Name": "TrainingData", @@ -12547,150 +12008,169 @@ "Default": "Auto" }, { - "Name": "LossFunction", - "Type": { - "Kind": "Component", - "ComponentKind": "ClassificationLossFunction" - }, - "Desc": "Loss Function", + "Name": "L2Weight", + "Type": "Float", + "Desc": "L2 regularization weight", "Aliases": [ - "loss" + "l2" ], "Required": false, "SortOrder": 50.0, "IsNullable": false, - "Default": { - "Name": "LogLoss" + "Default": 1.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.0, + "Max": 1.0, + "NumSteps": 4 } }, { - "Name": "L2Const", + "Name": "L1Weight", "Type": "Float", - "Desc": "L2 regularizer constant", + "Desc": "L1 regularization weight", "Aliases": [ - "l2" + "l1" ], "Required": false, "SortOrder": 50.0, "IsNullable": false, - "Default": 1E-06, + "Default": 1.0, + "SweepRange": { + "RangeType": "Float", 
+ "Min": 0.0, + "Max": 1.0, + "NumSteps": 4 + } + }, + { + "Name": "OptTol", + "Type": "Float", + "Desc": "Tolerance parameter for optimization convergence. Lower = slower, more accurate", + "Aliases": [ + "ot" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 1E-07, "SweepRange": { "RangeType": "Discrete", "Values": [ - 1E-07, - 5E-07, - 1E-06, - 5E-06, - 1E-05 + 0.0001, + 1E-07 ] } }, { - "Name": "NumThreads", + "Name": "MemorySize", "Type": "Int", - "Desc": "Degree of lock-free parallelism. Defaults to automatic depending on data sparseness. Determinism not guaranteed.", + "Desc": "Memory size for L-BFGS. Lower=faster, less accurate", "Aliases": [ - "nt", - "t", - "threads" + "m" ], "Required": false, "SortOrder": 50.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 20, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 5, + 20, + 50 + ] + } }, { - "Name": "ConvergenceTolerance", + "Name": "EnforceNonNegativity", + "Type": "Bool", + "Desc": "Enforce non-negative weights", + "Aliases": [ + "nn" + ], + "Required": false, + "SortOrder": 90.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "InitWtsDiameter", "Type": "Float", - "Desc": "Exponential moving averaged improvement tolerance for convergence", + "Desc": "Init weights diameter", "Aliases": [ - "tol" + "initwts" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 140.0, "IsNullable": false, - "Default": 0.0001, + "Default": 0.0, "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.01, - 0.001, - 0.0001, - 1E-05 - ] + "RangeType": "Float", + "Min": 0.0, + "Max": 1.0, + "NumSteps": 5 } }, { "Name": "MaxIterations", "Type": "Int", - "Desc": "Maximum number of iterations; set to 1 to simulate online learning.", + "Desc": "Maximum iterations.", "Aliases": [ - "iter" + "maxiter" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 20, + "Default": 2147483647, "SweepRange": { - "RangeType": 
"Discrete", - "Values": [ - 1, - 5, - 10, - 20 - ] + "RangeType": "Long", + "Min": 1, + "Max": 2147483647 } }, { - "Name": "InitLearningRate", + "Name": "SgdInitializationTolerance", "Type": "Float", - "Desc": "Initial learning rate (only used by SGD)", + "Desc": "Run SGD to initialize LR weights, converging to this tolerance", "Aliases": [ - "ilr", - "lr" + "sgd" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.01 + "Default": 0.0 }, { - "Name": "Shuffle", + "Name": "Quiet", "Type": "Bool", - "Desc": "Shuffle data every epoch?", + "Desc": "If set to true, produce no output during training.", "Aliases": [ - "shuf" + "q" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } + "Default": false }, { - "Name": "PositiveInstanceWeight", - "Type": "Float", - "Desc": "Apply weight to the positive class, for imbalanced data", + "Name": "UseThreads", + "Type": "Bool", + "Desc": "Whether or not to use threads. Default is true", "Aliases": [ - "piw" + "t" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0 + "Default": true }, { - "Name": "CheckFrequency", + "Name": "NumThreads", "Type": "Int", - "Desc": "Convergence check frequency (in terms of number of iterations). Default equals number of threads", + "Desc": "Number of threads", "Aliases": [ - "checkFreq" + "nt" ], "Required": false, "SortOrder": 150.0, @@ -12698,30 +12178,26 @@ "Default": null }, { - "Name": "Calibrator", - "Type": { - "Kind": "Component", - "ComponentKind": "CalibratorTrainer" - }, - "Desc": "The calibrator kind to apply to the predictor. 
Specify null for no calibration", + "Name": "DenseOptimizer", + "Type": "Bool", + "Desc": "Force densification of the internal optimization vectors", + "Aliases": [ + "do" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": { - "Name": "PlattCalibrator" + "Default": false, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] } - }, - { - "Name": "MaxCalibrationExamples", - "Type": "Int", - "Desc": "The maximum number of examples to use when training the calibrator", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1000000 - } - ], - "Outputs": [ + } + ], + "Outputs": [ { "Name": "PredictorModel", "Type": "PredictorModel", @@ -12734,1023 +12210,1198 @@ "ITrainerInput" ], "OutputKind": [ - "IBinaryClassificationOutput", + "IRegressionOutput", "ITrainerOutput" ] }, { - "Name": "Transforms.ApproximateBootstrapSampler", - "Desc": "Approximate bootstrap sampling.", - "FriendlyName": "Bootstrap Sample Transform", - "ShortName": "BootstrapSample", + "Name": "Trainers.StochasticDualCoordinateAscentBinaryClassifier", + "Desc": "Train an SDCA binary model.", + "FriendlyName": "Fast Linear (SA-SDCA)", + "ShortName": "SDCA", "Inputs": [ { - "Name": "Data", + "Name": "L2Const", + "Type": "Float", + "Desc": "L2 regularizer constant. 
By default the l2 constant is automatically inferred based on data set.", + "Aliases": [ + "l2" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 1E-07, + 1E-06, + 1E-05, + 0.0001, + 0.001, + 0.01 + ] + } + }, + { + "Name": "TrainingData", "Type": "DataView", - "Desc": "Input dataset", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "Complement", - "Type": "Bool", - "Desc": "Whether this is the out-of-bag sample, that is, all those rows that are not selected by the transform.", + "Name": "L1Threshold", + "Type": "Float", + "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set.", "Aliases": [ - "comp" + "l1" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 0.0, + 0.25, + 0.5, + 0.75, + 1.0 + ] + } + }, + { + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", + "Aliases": [ + "feat" + ], + "Required": false, + "SortOrder": 2.0, "IsNullable": false, - "Default": false + "Default": "Features" }, { - "Name": "Seed", - "Type": "UInt", - "Desc": "The random seed. If unspecified random state will be instead derived from the environment.", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", + "Aliases": [ + "lab" + ], "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Label" }, { - "Name": "ShuffleInput", - "Type": "Bool", - "Desc": "Whether we should attempt to shuffle the source data. 
By default on, but can be turned off for efficiency.", + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", "Aliases": [ - "si" + "norm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": true + "Default": "Auto" }, { - "Name": "PoolSize", - "Type": "Int", - "Desc": "When shuffling the output, the number of output rows to keep in that pool. Note that shuffling of output is completely distinct from shuffling of input.", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", "Aliases": [ - "pool" + "cache" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": 1000 - } - ], - "Outputs": [ + "Default": "Auto" + }, { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Name": "LossFunction", + "Type": { + "Kind": "Component", + "ComponentKind": "SDCAClassificationLossFunction" + }, + "Desc": "Loss Function", + "Aliases": [ + "loss" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": { + "Name": "LogLoss" + } }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.BinaryPredictionScoreColumnsRenamer", - "Desc": "For binary prediction, it renames the PredictedLabel and Score columns to include the name of the positive class.", - "FriendlyName": "Rename Binary Prediction Score Columns", - "ShortName": null, - "Inputs": [ + "Name": "NumThreads", + "Type": "Int", + "Desc": "Degree of lock-free parallelism. Defaults to automatic. 
Determinism not guaranteed.", + "Aliases": [ + "nt", + "t", + "threads" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": true, + "Default": null + }, { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Name": "PositiveInstanceWeight", + "Type": "Float", + "Desc": "Apply weight to the positive class, for imbalanced data", + "Aliases": [ + "piw" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 }, { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The predictor model used in scoring", - "Required": true, - "SortOrder": 2.0, - "IsNullable": false - } - ], - "Outputs": [ + "Name": "Calibrator", + "Type": { + "Kind": "Component", + "ComponentKind": "CalibratorTrainer" + }, + "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": { + "Name": "PlattCalibrator" + } + }, { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Name": "MaxCalibrationExamples", + "Type": "Int", + "Desc": "The maximum number of examples to use when training the calibrator", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000000 }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.BinNormalizer", - "Desc": "The values are assigned into equidensity bins and a value is mapped to its bin_number/number_of_bins.", - "FriendlyName": "Binning Normalizer", - "ShortName": "Bin", - "Inputs": [ - { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "NumBins", - "Type": "Int", - "Desc": "Max number of bins, power of 2 recommended", - "Aliases": [ - "bins" - ], - "Required": 
false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "FixZero", - "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", - "Aliases": [ - "zero" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "MaxTrainingExamples", - "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", - "Aliases": [ - "maxtrain" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": "New column definition(s) (optional form: name:src)", + "Name": "ConvergenceTolerance", + "Type": "Float", + "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.", "Aliases": [ - "col" + "tol" ], "Required": false, - "SortOrder": 1.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": null - }, - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Default": 0.1, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.001, + 0.01, + 0.1, + 0.2 + ] + } }, { - "Name": "NumBins", + "Name": "MaxIterations", "Type": "Int", - "Desc": "Max number of bins, power of 2 recommended", + "Desc": "Maximum number of iterations; set to 1 to simulate online learning. 
Defaults to automatic.", "Aliases": [ - "bins" + "iter" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": 1024 + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 10, + 20, + 100 + ] + } }, { - "Name": "FixZero", + "Name": "Shuffle", "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", + "Desc": "Shuffle data every epoch?", "Aliases": [ - "zero" + "shuf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": true, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } }, { - "Name": "MaxTrainingExamples", + "Name": "CheckFrequency", "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", + "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations.", "Aliases": [ - "maxtrain" + "checkFreq" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "BiasLearningRate", + "Type": "Float", + "Desc": "The learning rate for adjusting bias from being regularized.", + "Aliases": [ + "blr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1000000000 + "Default": 0.0, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.0, + 0.01, + 0.1, + 1.0 + ] + } } ], "Outputs": [ { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" } ], "InputKind": [ - "ITransformInput" + "ITrainerInputWithLabel", + "ITrainerInput" ], "OutputKind": [ - "ITransformOutput" + "IBinaryClassificationOutput", + "ITrainerOutput" ] }, { - "Name": "Transforms.CategoricalHashOneHotVectorizer", - "Desc": 
"Encodes the categorical variable with hash-based encoding", - "FriendlyName": "Categorical Hash Transform", - "ShortName": null, + "Name": "Trainers.StochasticDualCoordinateAscentClassifier", + "Desc": "Train an SDCA multi class model", + "FriendlyName": "Fast Linear Multi-class Classification (SA-SDCA)", + "ShortName": "sasdcamc", "Inputs": [ { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "OutputKind", - "Type": { - "Kind": "Enum", - "Values": [ - "Bag", - "Ind", - "Key", - "Bin" - ] - }, - "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", - "Aliases": [ - "kind" - ], - "Required": false, - "SortOrder": 102.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "HashBits", - "Type": "Int", - "Desc": "The number of bits to hash into. Must be between 1 and 30, inclusive.", - "Aliases": [ - "bits" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Seed", - "Type": "UInt", - "Desc": "Hashing seed", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Ordered", - "Type": "Bool", - "Desc": "Whether the position of each term should be included in the hash", - "Aliases": [ - "ord" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "InvertHash", - "Type": "Int", - "Desc": "Limit the number of keys used to generate the slot name to this many. 
0 means no invert hashing, -1 means no limit.", - "Aliases": [ - "ih" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": "New column definition(s) (optional form: name:hashBits:src)", + "Name": "L2Const", + "Type": "Float", + "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.", "Aliases": [ - "col" + "l2" ], - "Required": true, + "Required": false, "SortOrder": 1.0, - "IsNullable": false + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 1E-07, + 1E-06, + 1E-05, + 0.0001, + 0.001, + 0.01 + ] + } }, { - "Name": "Data", + "Name": "TrainingData", "Type": "DataView", - "Desc": "Input dataset", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "HashBits", - "Type": "Int", - "Desc": "Number of bits to hash into. Must be between 1 and 30, inclusive.", + "Name": "L1Threshold", + "Type": "Float", + "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. 
By default the l1 threshold is automatically inferred based on data set.", "Aliases": [ - "bits" + "l1" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 0.0, + 0.25, + 0.5, + 0.75, + 1.0 + ] + } + }, + { + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", + "Aliases": [ + "feat" ], "Required": false, "SortOrder": 2.0, "IsNullable": false, - "Default": 16 + "Default": "Features" }, { - "Name": "OutputKind", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", + "Aliases": [ + "lab" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Label" + }, + { + "Name": "NormalizeFeatures", "Type": { "Kind": "Enum", "Values": [ - "Bag", - "Ind", - "Key", - "Bin" + "No", + "Warn", + "Auto", + "Yes" ] }, - "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", + "Desc": "Normalize option for the feature column", "Aliases": [ - "kind" + "norm" ], "Required": false, - "SortOrder": 102.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": "Bag" + "Default": "Auto" }, { - "Name": "Seed", - "Type": "UInt", - "Desc": "Hashing seed", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", + "Aliases": [ + "cache" + ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": 314489979 + "Default": "Auto" }, { - "Name": "Ordered", - "Type": "Bool", - "Desc": "Whether the position of each term should be included in the hash", + "Name": "LossFunction", + "Type": { + "Kind": "Component", + "ComponentKind": "SDCAClassificationLossFunction" + }, + "Desc": "Loss Function", "Aliases": [ - "ord" + "loss" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": true + 
"Default": { + "Name": "LogLoss" + } }, { - "Name": "InvertHash", + "Name": "NumThreads", "Type": "Int", - "Desc": "Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.", + "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.", "Aliases": [ - "ih" + "nt", + "t", + "threads" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "ConvergenceTolerance", + "Type": "Float", + "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.", + "Aliases": [ + "tol" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 - } - ], - "Outputs": [ + "Default": 0.1, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.001, + 0.01, + 0.1, + 0.2 + ] + } + }, { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Name": "MaxIterations", + "Type": "Int", + "Desc": "Maximum number of iterations; set to 1 to simulate online learning. 
Defaults to automatic.", + "Aliases": [ + "iter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 10, + 20, + 100 + ] + } }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.CategoricalOneHotVectorizer", - "Desc": "Encodes the categorical variable with one-hot encoding based on term dictionary", - "FriendlyName": "Categorical Transform", - "ShortName": null, - "Inputs": [ + "Name": "Shuffle", + "Type": "Bool", + "Desc": "Shuffle data every epoch?", + "Aliases": [ + "shuf" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } + }, { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "OutputKind", - "Type": { - "Kind": "Enum", - "Values": [ - "Bag", - "Ind", - "Key", - "Bin" - ] - }, - "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector", - "Aliases": [ - "kind" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "MaxNumTerms", - "Type": "Int", - "Desc": "Maximum number of terms to keep when auto-training", - "Aliases": [ - "max" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Term", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "List of terms", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Sort", - "Type": { - "Kind": "Enum", - "Values": [ - "Occurrence", - "Value" - ] - }, - "Desc": "How items should be ordered when vectorized. 
By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "TextKeyValues", - "Type": "Bool", - "Desc": "Whether key value metadata should be text, regardless of the actual input type", - "Aliases": [ - "textkv" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": "New column definition(s) (optional form: name:src)", + "Name": "CheckFrequency", + "Type": "Int", + "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. 
If left blank, it defaults to check after every 'numThreads' iterations.", "Aliases": [ - "col" + "checkFreq" ], - "Required": true, + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "BiasLearningRate", + "Type": "Float", + "Desc": "The learning rate for adjusting bias from being regularized.", + "Aliases": [ + "blr" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.0, + 0.01, + 0.1, + 1.0 + ] + } + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IMulticlassClassificationOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.StochasticDualCoordinateAscentRegressor", + "Desc": "Train an SDCA regression model", + "FriendlyName": "Fast Linear Regression (SA-SDCA)", + "ShortName": "sasdcar", + "Inputs": [ + { + "Name": "L2Const", + "Type": "Float", + "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.", + "Aliases": [ + "l2" + ], + "Required": false, "SortOrder": 1.0, - "IsNullable": false + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 1E-07, + 1E-06, + 1E-05, + 0.0001, + 0.001, + 0.01 + ] + } }, { - "Name": "Data", + "Name": "TrainingData", "Type": "DataView", - "Desc": "Input dataset", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "MaxNumTerms", - "Type": "Int", - "Desc": "Maximum number of terms to keep per column when auto-training", + "Name": "L1Threshold", + "Type": "Float", + "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. 
By default the l1 threshold is automatically inferred based on data set.", "Aliases": [ - "max" + "l1" ], "Required": false, - "SortOrder": 5.0, - "IsNullable": false, - "Default": 1000000 - }, - { - "Name": "OutputKind", - "Type": { - "Kind": "Enum", + "SortOrder": 2.0, + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", "Values": [ - "Bag", - "Ind", - "Key", - "Bin" + "", + 0.0, + 0.25, + 0.5, + 0.75, + 1.0 ] - }, - "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", + } + }, + { + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", "Aliases": [ - "kind" + "feat" ], "Required": false, - "SortOrder": 102.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": "Ind" + "Default": "Features" }, { - "Name": "Term", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "List of terms", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", + "Aliases": [ + "lab" + ], "Required": false, - "SortOrder": 106.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": null + "Default": "Label" }, { - "Name": "Sort", + "Name": "NormalizeFeatures", "Type": { "Kind": "Enum", "Values": [ - "Occurrence", - "Value" + "No", + "Warn", + "Auto", + "Yes" ] }, - "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Desc": "Normalize option for the feature column", + "Aliases": [ + "norm" + ], "Required": false, - "SortOrder": 113.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": "Occurrence" + "Default": "Auto" }, { - "Name": "TextKeyValues", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", + "Aliases": [ + "cache" + ], + "Required": false, + "SortOrder": 6.0, + "IsNullable": false, + "Default": "Auto" + }, + { + "Name": "LossFunction", + "Type": { + "Kind": "Component", + "ComponentKind": "SDCARegressionLossFunction" + }, + "Desc": "Loss Function", + "Aliases": [ + "loss" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": { + "Name": "SquaredLoss" + } + }, + { + "Name": "NumThreads", + "Type": "Int", + "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.", + "Aliases": [ + "nt", + "t", + "threads" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "ConvergenceTolerance", + "Type": "Float", + "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.", + "Aliases": [ + "tol" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.001, + 0.01, + 0.1, + 0.2 + ] + } + }, + { + "Name": "MaxIterations", + "Type": "Int", + "Desc": "Maximum number of iterations; set to 1 to simulate online learning. 
Defaults to automatic.", + "Aliases": [ + "iter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 10, + 20, + 100 + ] + } + }, + { + "Name": "Shuffle", "Type": "Bool", - "Desc": "Whether key value metadata should be text, regardless of the actual input type", + "Desc": "Shuffle data every epoch?", "Aliases": [ - "textkv" + "shuf" ], "Required": false, - "SortOrder": 114.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": true, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } + }, + { + "Name": "CheckFrequency", + "Type": "Int", + "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations.", + "Aliases": [ + "checkFreq" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "BiasLearningRate", + "Type": "Float", + "Desc": "The learning rate for adjusting bias from being regularized.", + "Aliases": [ + "blr" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.0, + 0.01, + 0.1, + 1.0 + ] + } } ], "Outputs": [ { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" } ], "InputKind": [ - "ITransformInput" + "ITrainerInputWithLabel", + "ITrainerInput" ], "OutputKind": [ - "ITransformOutput" + "IRegressionOutput", + "ITrainerOutput" ] }, { - "Name": "Transforms.CharacterTokenizer", - "Desc": "Character-oriented tokenizer where text is considered a sequence of characters.", - "FriendlyName": "Character Tokenizer Transform", - "ShortName": 
"CharToken", + "Name": "Trainers.StochasticGradientDescentBinaryClassifier", + "Desc": "Train an Hogwild SGD binary model.", + "FriendlyName": "Hogwild SGD (binary)", + "ShortName": "HogwildSGD", "Inputs": [ { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": "New column definition(s) (optional form: name:src)", + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", "Aliases": [ - "col" + "data" ], "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "UseMarkerChars", - "Type": "Bool", - "Desc": "Whether to mark the beginning/end of each row/slot with start of text character (0x02)/end of text character (0x03)", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", "Aliases": [ - "mark" + "feat" ], "Required": false, "SortOrder": 2.0, "IsNullable": false, - "Default": true - } - ], - "Outputs": [ + "Default": "Features" + }, { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", + "Aliases": [ + "lab" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Label" }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - 
"Name": "Transforms.ColumnConcatenator", - "Desc": "Concatenates two columns of the same item type.", - "FriendlyName": "Concat Transform", - "ShortName": "Concat", - "Inputs": [ - { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": "New column definition(s) (optional form: name:srcs)", + "Name": "WeightColumn", + "Type": "String", + "Desc": "Column to use for example weight", "Aliases": [ - "col" + "weight" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": "Weight" }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.ColumnCopier", - "Desc": "Duplicates columns from the dataset", - "FriendlyName": "Copy Columns Transform", - "ShortName": "Copy", - "Inputs": [ - { - "Name": "Column", + "Name": "NormalizeFeatures", "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { 
- "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] }, - "Desc": "New column definition(s) (optional form: name:src)", + "Desc": "Normalize option for the feature column", "Aliases": [ - "col" + "norm" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": "Auto" }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.ColumnDropper", - "Desc": "Drops columns from the dataset", - "FriendlyName": "Drop Columns Transform", - "ShortName": "Drop", - "Inputs": [ - { - "Name": "Column", + "Name": "Caching", "Type": { - "Kind": "Array", - "ItemType": "String" + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] }, - "Desc": "Column name to drop", + "Desc": "Whether learner should cache input training data", "Aliases": [ - "col" + "cache" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Required": false, + "SortOrder": 6.0, + "IsNullable": false, + "Default": "Auto" }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - 
"OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.ColumnSelector", - "Desc": "Selects a set of columns, dropping all others", - "FriendlyName": "Select Columns", - "ShortName": null, - "Inputs": [ - { - "Name": "Column", + "Name": "LossFunction", "Type": { - "Kind": "Array", - "ItemType": "String" + "Kind": "Component", + "ComponentKind": "ClassificationLossFunction" }, - "Desc": "Column name to keep", + "Desc": "Loss Function", "Aliases": [ - "col" + "loss" ], "Required": false, - "SortOrder": 1.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": null + "Default": { + "Name": "LogLoss" + } }, { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, + "Name": "L2Const", + "Type": "Float", + "Desc": "L2 regularizer constant", + "Aliases": [ + "l2" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 1E-06, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1E-07, + 5E-07, + 1E-06, + 5E-06, + 1E-05 + ] + } + }, + { + "Name": "NumThreads", + "Type": "Int", + "Desc": "Degree of lock-free parallelism. Defaults to automatic depending on data sparseness. 
Determinism not guaranteed.", + "Aliases": [ + "nt", + "t", + "threads" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "ConvergenceTolerance", + "Type": "Float", + "Desc": "Exponential moving averaged improvement tolerance for convergence", + "Aliases": [ + "tol" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0001, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.01, + 0.001, + 0.0001, + 1E-05 + ] + } + }, + { + "Name": "MaxIterations", + "Type": "Int", + "Desc": "Maximum number of iterations; set to 1 to simulate online learning.", + "Aliases": [ + "iter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 20, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 5, + 10, + 20 + ] + } + }, + { + "Name": "InitLearningRate", + "Type": "Float", + "Desc": "Initial learning rate (only used by SGD)", + "Aliases": [ + "ilr", + "lr" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01 + }, + { + "Name": "Shuffle", + "Type": "Bool", + "Desc": "Shuffle data every epoch?", + "Aliases": [ + "shuf" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } + }, + { + "Name": "PositiveInstanceWeight", + "Type": "Float", + "Desc": "Apply weight to the positive class, for imbalanced data", + "Aliases": [ + "piw" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + }, + { + "Name": "CheckFrequency", + "Type": "Int", + "Desc": "Convergence check frequency (in terms of number of iterations). 
Default equals number of threads", + "Aliases": [ + "checkFreq" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Calibrator", + "Type": { + "Kind": "Component", + "ComponentKind": "CalibratorTrainer" + }, + "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": { + "Name": "PlattCalibrator" + } + }, + { + "Name": "MaxCalibrationExamples", + "Type": "Int", + "Desc": "The maximum number of examples to use when training the calibrator", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000000 + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithWeight", + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IBinaryClassificationOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Transforms.ApproximateBootstrapSampler", + "Desc": "Approximate bootstrap sampling.", + "FriendlyName": "Bootstrap Sample Transform", + "ShortName": "BootstrapSample", + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, "SortOrder": 1.0, "IsNullable": false + }, + { + "Name": "Complement", + "Type": "Bool", + "Desc": "Whether this is the out-of-bag sample, that is, all those rows that are not selected by the transform.", + "Aliases": [ + "comp" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "Seed", + "Type": "UInt", + "Desc": "The random seed. If unspecified random state will be instead derived from the environment.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "ShuffleInput", + "Type": "Bool", + "Desc": "Whether we should attempt to shuffle the source data. 
By default on, but can be turned off for efficiency.", + "Aliases": [ + "si" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "PoolSize", + "Type": "Int", + "Desc": "When shuffling the output, the number of output rows to keep in that pool. Note that shuffling of output is completely distinct from shuffling of input.", + "Aliases": [ + "pool" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000 } ], "Outputs": [ @@ -13773,206 +13424,11 @@ ] }, { - "Name": "Transforms.ColumnTypeConverter", - "Desc": "Converts a column to a different type, using standard conversions.", - "FriendlyName": "Convert Transform", - "ShortName": "Convert", + "Name": "Transforms.BinaryPredictionScoreColumnsRenamer", + "Desc": "For binary prediction, it renames the PredictedLabel and Score columns to include the name of the positive class.", + "FriendlyName": "Rename Binary Prediction Score Columns", + "ShortName": null, "Inputs": [ - { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "ResultType", - "Type": { - "Kind": "Enum", - "Values": [ - "I1", - "U1", - "I2", - "U2", - "I4", - "U4", - "I8", - "U8", - "R4", - "Num", - "R8", - "TX", - "Text", - "TXT", - "BL", - "Bool", - "TimeSpan", - "TS", - "DT", - "DateTime", - "DZ", - "DateTimeZone", - "UG", - "U16" - ] - }, - "Desc": "The result type", - "Aliases": [ - "type" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Range", - "Type": "String", - "Desc": "For a key column, this defines the range of values", - "Aliases": [ - "key" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": 
"Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": "New column definition(s) (optional form: name:type:src)", - "Aliases": [ - "col" - ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "ResultType", - "Type": { - "Kind": "Enum", - "Values": [ - "I1", - "U1", - "I2", - "U2", - "I4", - "U4", - "I8", - "U8", - "R4", - "Num", - "R8", - "TX", - "Text", - "TXT", - "BL", - "Bool", - "TimeSpan", - "TS", - "DT", - "DateTime", - "DZ", - "DateTimeZone", - "UG", - "U16" - ] - }, - "Desc": "The result type", - "Aliases": [ - "type" - ], - "Required": false, - "SortOrder": 2.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Range", - "Type": "String", - "Desc": "For a key column, this defines the range of values", - "Aliases": [ - "key" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.CombinerByContiguousGroupId", - "Desc": "Groups values of a scalar column into a vector, by a contiguous group ID", - "FriendlyName": "Group Transform", - "ShortName": "Group", - "Inputs": [ - { - "Name": "GroupKey", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Columns to group by", - "Aliases": [ - "g" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": null - }, { "Name": "Data", "Type": "DataView", @@ -13982,15 +13438,9 @@ "IsNullable": false }, { - "Name": 
"Column", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Columns to group together", - "Aliases": [ - "col" - ], + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The predictor model used in scoring", "Required": true, "SortOrder": 2.0, "IsNullable": false @@ -14016,10 +13466,10 @@ ] }, { - "Name": "Transforms.ConditionalNormalizer", - "Desc": "Normalize the columns only if needed", - "FriendlyName": "Normalize If Needed", - "ShortName": null, + "Name": "Transforms.BinNormalizer", + "Desc": "The values are assigned into equidensity bins and a value is mapped to its bin_number/number_of_bins.", + "FriendlyName": "Binning Normalizer", + "ShortName": "Bin", "Inputs": [ { "Name": "Column", @@ -14028,6 +13478,18 @@ "ItemType": { "Kind": "Struct", "Fields": [ + { + "Name": "NumBins", + "Type": "Int", + "Desc": "Max number of bins, power of 2 recommended", + "Aliases": [ + "bins" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, { "Name": "FixZero", "Type": "Bool", @@ -14083,9 +13545,10 @@ "Aliases": [ "col" ], - "Required": true, + "Required": false, "SortOrder": 1.0, - "IsNullable": false + "IsNullable": false, + "Default": null }, { "Name": "Data", @@ -14095,6 +13558,18 @@ "SortOrder": 1.0, "IsNullable": false }, + { + "Name": "NumBins", + "Type": "Int", + "Desc": "Max number of bins, power of 2 recommended", + "Aliases": [ + "bins" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1024 + }, { "Name": "FixZero", "Type": "Bool", @@ -14134,149 +13609,50 @@ ], "InputKind": [ "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" ] }, { - "Name": "Transforms.DataCache", - "Desc": "Caches using the specified cache option.", - "FriendlyName": "Cache Data", + "Name": "Transforms.CategoricalHashOneHotVectorizer", + "Desc": "Encodes the categorical variable with hash-based encoding", + "FriendlyName": "Categorical Hash Transform", "ShortName": null, 
"Inputs": [ { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Caching", - "Type": { - "Kind": "Enum", - "Values": [ - "Memory", - "Disk" - ] - }, - "Desc": "Caching strategy", - "Required": true, - "SortOrder": 2.0, - "IsNullable": false, - "Default": "Memory" - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Dataset" - } - ], - "InputKind": [ - "ITransformInput" - ] - }, - { - "Name": "Transforms.DatasetScorer", - "Desc": "Score a dataset with a predictor model", - "FriendlyName": null, - "ShortName": null, - "Inputs": [ - { - "Name": "Data", - "Type": "DataView", - "Desc": "The dataset to be scored", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The predictor model to apply to data", - "Required": true, - "SortOrder": 2.0, - "IsNullable": false - }, - { - "Name": "Suffix", - "Type": "String", - "Desc": "Suffix to append to the score columns", - "Required": false, - "SortOrder": 3.0, - "IsNullable": false, - "Default": null - } - ], - "Outputs": [ - { - "Name": "ScoredData", - "Type": "DataView", - "Desc": "The scored dataset" - }, - { - "Name": "ScoringTransform", - "Type": "TransformModel", - "Desc": "The scoring transform" - } - ] - }, - { - "Name": "Transforms.DatasetTransformScorer", - "Desc": "Score a dataset with a transform model", - "FriendlyName": null, - "ShortName": null, - "Inputs": [ - { - "Name": "Data", - "Type": "DataView", - "Desc": "The dataset to be scored", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "TransformModel", - "Type": "TransformModel", - "Desc": "The transform model to apply to data", - "Required": true, - "SortOrder": 2.0, - "IsNullable": false - } - ], - "Outputs": [ - { - "Name": "ScoredData", - "Type": "DataView", - "Desc": "The scored dataset" - }, - { - "Name": 
"ScoringTransform", - "Type": "TransformModel", - "Desc": "The scoring transform" - } - ] - }, - { - "Name": "Transforms.Dictionarizer", - "Desc": "Converts input values (words, numbers, etc.) to index in a dictionary.", - "FriendlyName": "Term Transform", - "ShortName": "TermTransform", - "Inputs": [ - { - "Name": "Column", + "Name": "Column", "Type": { "Kind": "Array", "ItemType": { "Kind": "Struct", "Fields": [ { - "Name": "MaxNumTerms", + "Name": "OutputKind", + "Type": { + "Kind": "Enum", + "Values": [ + "Bag", + "Ind", + "Key", + "Bin" + ] + }, + "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", + "Aliases": [ + "kind" + ], + "Required": false, + "SortOrder": 102.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "HashBits", "Type": "Int", - "Desc": "Maximum number of terms to keep when auto-training", + "Desc": "The number of bits to hash into. Must be between 1 and 30, inclusive.", "Aliases": [ - "max" + "bits" ], "Required": false, "SortOrder": 150.0, @@ -14284,38 +13660,32 @@ "Default": null }, { - "Name": "Term", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "List of terms", + "Name": "Seed", + "Type": "UInt", + "Desc": "Hashing seed", "Required": false, "SortOrder": 150.0, - "IsNullable": false, + "IsNullable": true, "Default": null }, { - "Name": "Sort", - "Type": { - "Kind": "Enum", - "Values": [ - "Occurrence", - "Value" - ] - }, - "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Name": "Ordered", + "Type": "Bool", + "Desc": "Whether the position of each term should be included in the hash", + "Aliases": [ + "ord" + ], "Required": false, "SortOrder": 150.0, "IsNullable": true, "Default": null }, { - "Name": "TextKeyValues", - "Type": "Bool", - "Desc": "Whether key value metadata should be text, regardless of the actual input type", + "Name": "InvertHash", + "Type": "Int", + "Desc": "Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.", "Aliases": [ - "textkv" + "ih" ], "Required": false, "SortOrder": 150.0, @@ -14349,14 +13719,13 @@ ] } }, - "Desc": "New column definition(s) (optional form: name:src)", + "Desc": "New column definition(s) (optional form: name:hashBits:src)", "Aliases": [ "col" ], - "Required": false, + "Required": true, "SortOrder": 1.0, - "IsNullable": false, - "Default": null + "IsNullable": false }, { "Name": "Data", @@ -14367,55 +13736,69 @@ "IsNullable": false }, { - "Name": "MaxNumTerms", + "Name": "HashBits", "Type": "Int", - "Desc": "Maximum number of terms to keep per column when auto-training", + "Desc": "Number of bits to hash into. 
Must be between 1 and 30, inclusive.", "Aliases": [ - "max" + "bits" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": 1000000 + "Default": 16 }, { - "Name": "Term", + "Name": "OutputKind", "Type": { - "Kind": "Array", - "ItemType": "String" + "Kind": "Enum", + "Values": [ + "Bag", + "Ind", + "Key", + "Bin" + ] }, - "Desc": "List of terms", + "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", + "Aliases": [ + "kind" + ], "Required": false, - "SortOrder": 106.0, + "SortOrder": 102.0, "IsNullable": false, - "Default": null + "Default": "Bag" }, { - "Name": "Sort", - "Type": { - "Kind": "Enum", - "Values": [ - "Occurrence", - "Value" - ] - }, - "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Name": "Seed", + "Type": "UInt", + "Desc": "Hashing seed", "Required": false, - "SortOrder": 113.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Occurrence" + "Default": 314489979 }, { - "Name": "TextKeyValues", + "Name": "Ordered", "Type": "Bool", - "Desc": "Whether key value metadata should be text, regardless of the actual input type", + "Desc": "Whether the position of each term should be included in the hash", "Aliases": [ - "textkv" + "ord" ], "Required": false, - "SortOrder": 114.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": true + }, + { + "Name": "InvertHash", + "Type": "Int", + "Desc": "Limit the number of keys used to generate the slot name to this many. 
0 means no invert hashing, -1 means no limit.", + "Aliases": [ + "ih" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0 } ], "Outputs": [ @@ -14438,30 +13821,202 @@ ] }, { - "Name": "Transforms.FeatureCombiner", - "Desc": "Combines all the features into one feature column.", - "FriendlyName": "Feature Combiner", - "ShortName": "fc", + "Name": "Transforms.CategoricalOneHotVectorizer", + "Desc": "Encodes the categorical variable with one-hot encoding based on term dictionary", + "FriendlyName": "Categorical Transform", + "ShortName": null, "Inputs": [ { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Features", + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "OutputKind", + "Type": { + "Kind": "Enum", + "Values": [ + "Bag", + "Ind", + "Key", + "Bin" + ] + }, + "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector", + "Aliases": [ + "kind" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "MaxNumTerms", + "Type": "Int", + "Desc": "Maximum number of terms to keep when auto-training", + "Aliases": [ + "max" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Term", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of terms", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Sort", + "Type": { + "Kind": "Enum", + "Values": [ + "Occurrence", + "Value" + ] + }, + "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "TextKeyValues", + "Type": "Bool", + "Desc": "Whether key value metadata should be text, regardless of the actual input type", + "Aliases": [ + "textkv" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "MaxNumTerms", + "Type": "Int", + "Desc": "Maximum number of terms to keep per column when auto-training", + "Aliases": [ + "max" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": 1000000 + }, + { + "Name": "OutputKind", + "Type": { + "Kind": "Enum", + "Values": [ + "Bag", + "Ind", + "Key", + "Bin" + ] + }, + "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", + "Aliases": [ + "kind" + ], + "Required": false, + "SortOrder": 102.0, + "IsNullable": false, + "Default": "Ind" + }, + { + "Name": "Term", "Type": { "Kind": "Array", "ItemType": "String" }, - "Desc": "Features", + "Desc": "List of terms", "Required": false, - "SortOrder": 2.0, + "SortOrder": 106.0, "IsNullable": false, "Default": null + }, + { + "Name": 
"Sort", + "Type": { + "Kind": "Enum", + "Values": [ + "Occurrence", + "Value" + ] + }, + "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Required": false, + "SortOrder": 113.0, + "IsNullable": false, + "Default": "Occurrence" + }, + { + "Name": "TextKeyValues", + "Type": "Bool", + "Desc": "Whether key value metadata should be text, regardless of the actual input type", + "Aliases": [ + "textkv" + ], + "Required": false, + "SortOrder": 114.0, + "IsNullable": false, + "Default": true } ], "Outputs": [ @@ -14484,18 +14039,46 @@ ] }, { - "Name": "Transforms.FeatureSelectorByCount", - "Desc": "Selects the slots for which the count of non-default values is greater than or equal to a threshold.", - "FriendlyName": "Count Feature Selection Transform", - "ShortName": null, + "Name": "Transforms.CharacterTokenizer", + "Desc": "Character-oriented tokenizer where text is considered a sequence of characters.", + "FriendlyName": "Character Tokenizer Transform", + "ShortName": "CharToken", "Inputs": [ { "Name": "Column", "Type": { "Kind": "Array", - "ItemType": "String" + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } }, - "Desc": "Columns to use for feature selection", + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ "col" ], @@ -14503,18 +14086,6 @@ "SortOrder": 1.0, "IsNullable": false }, - { - "Name": "Count", - "Type": "Int", - "Desc": "If the count of 
non-default values for a slot is greater than or equal to this threshold, the slot is preserved", - "Aliases": [ - "c" - ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false, - "Default": 1 - }, { "Name": "Data", "Type": "DataView", @@ -14522,6 +14093,18 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false + }, + { + "Name": "UseMarkerChars", + "Type": "Bool", + "Desc": "Whether to mark the beginning/end of each row/slot with start of text character (0x02)/end of text character (0x03)", + "Aliases": [ + "mark" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": true } ], "Outputs": [ @@ -14544,95 +14127,10 @@ ] }, { - "Name": "Transforms.FeatureSelectorByMutualInformation", - "Desc": "Selects the top k slots across all specified columns ordered by their mutual information with the label column.", - "FriendlyName": "Mutual Information Feature Selection Transform", - "ShortName": "MIFeatureSelection", - "Inputs": [ - { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Columns to use for feature selection", - "Aliases": [ - "col" - ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "SlotsInOutput", - "Type": "Int", - "Desc": "The maximum number of slots to preserve in output", - "Aliases": [ - "topk", - "numSlotsToKeep" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": 1000 - }, - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Column to use for labels", - "Aliases": [ - "lab" - ], - "Required": false, - "SortOrder": 4.0, - "IsNullable": false, - "Default": "Label" - }, - { - "Name": "NumBins", - "Type": "Int", - "Desc": "Max number of bins for R4/R8 columns, power of 2 recommended", - "Aliases": [ - "bins" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": 
false, - "Default": 256 - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.GlobalContrastNormalizer", - "Desc": "Performs a global contrast normalization on input values: Y = (s * X - M) / D, where s is a scale, M is mean and D is either L2 norm or standard deviation.", - "FriendlyName": "Global Contrast Normalization Transform", - "ShortName": "Gcn", + "Name": "Transforms.ColumnConcatenator", + "Desc": "Concatenates two columns of the same item type.", + "FriendlyName": "Concat Transform", + "ShortName": "Concat", "Inputs": [ { "Name": "Column", @@ -14641,33 +14139,6 @@ "ItemType": { "Kind": "Struct", "Fields": [ - { - "Name": "UseStdDev", - "Type": "Bool", - "Desc": "Normalize by standard deviation rather than L2 norm", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Scale", - "Type": "Float", - "Desc": "Scale features by this value", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "SubMean", - "Type": "Bool", - "Desc": "Subtract mean from each value before normalizing", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, { "Name": "Name", "Type": "String", @@ -14682,7 +14153,10 @@ }, { "Name": "Source", - "Type": "String", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, "Desc": "Name of the source column", "Aliases": [ "src" @@ -14695,23 +14169,13 @@ ] } }, - "Desc": "New column definition(s) (optional form: name:src)", + "Desc": "New column definition(s) (optional form: name:srcs)", "Aliases": [ "col" ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "SubMean", - "Type": "Bool", - "Desc": 
"Subtract mean from each value before normalizing", - "Required": false, + "Required": true, "SortOrder": 1.0, - "IsNullable": false, - "Default": true + "IsNullable": false }, { "Name": "Data", @@ -14720,27 +14184,6 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false - }, - { - "Name": "UseStdDev", - "Type": "Bool", - "Desc": "Normalize by standard deviation rather than L2 norm", - "Aliases": [ - "useStd" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "Scale", - "Type": "Float", - "Desc": "Scale features by this value", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1.0 } ], "Outputs": [ @@ -14763,10 +14206,10 @@ ] }, { - "Name": "Transforms.HashConverter", - "Desc": "Converts column values into hashes. This transform accepts both numeric and text inputs, both single and vector-valued columns. This is a part of the Dracula transform.", - "FriendlyName": "Hash Join Transform", - "ShortName": "HashJoin", + "Name": "Transforms.ColumnCopier", + "Desc": "Duplicates columns from the dataset", + "FriendlyName": "Copy Columns Transform", + "ShortName": "Copy", "Inputs": [ { "Name": "Column", @@ -14775,57 +14218,6 @@ "ItemType": { "Kind": "Struct", "Fields": [ - { - "Name": "Join", - "Type": "Bool", - "Desc": "Whether the values need to be combined for a single hash", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "CustomSlotMap", - "Type": "String", - "Desc": "Which slots should be combined together. Example: 0,3,5;0,1;3;2,1,0. Overrides 'join'.", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "HashBits", - "Type": "Int", - "Desc": "Number of bits to hash into. 
Must be between 1 and 31, inclusive.", - "Aliases": [ - "bits" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Seed", - "Type": "UInt", - "Desc": "Hashing seed", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Ordered", - "Type": "Bool", - "Desc": "Whether the position of each term should be included in the hash", - "Aliases": [ - "ord" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, { "Name": "Name", "Type": "String", @@ -14868,48 +14260,6 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false - }, - { - "Name": "HashBits", - "Type": "Int", - "Desc": "Number of bits to hash into. Must be between 1 and 31, inclusive.", - "Aliases": [ - "bits" - ], - "Required": false, - "SortOrder": 2.0, - "IsNullable": false, - "Default": 31 - }, - { - "Name": "Join", - "Type": "Bool", - "Desc": "Whether the values need to be combined for a single hash", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "Seed", - "Type": "UInt", - "Desc": "Hashing seed", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 314489979 - }, - { - "Name": "Ordered", - "Type": "Bool", - "Desc": "Whether the position of each term should be included in the hash", - "Aliases": [ - "ord" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true } ], "Outputs": [ @@ -14932,46 +14282,18 @@ ] }, { - "Name": "Transforms.KeyToTextConverter", - "Desc": "KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the KeyValues metadata.", - "FriendlyName": "Key To Value Transform", - "ShortName": null, + "Name": "Transforms.ColumnDropper", + "Desc": "Drops columns from the dataset", + "FriendlyName": "Drop Columns Transform", + "ShortName": "Drop", "Inputs": [ { "Name": "Column", "Type": { "Kind": "Array", 
- "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } + "ItemType": "String" }, - "Desc": "New column definition(s) (optional form: name:src)", + "Desc": "Column name to drop", "Aliases": [ "col" ], @@ -15008,11 +14330,26 @@ ] }, { - "Name": "Transforms.LabelColumnKeyBooleanConverter", - "Desc": "Transforms the label to either key or bool (if needed) to make it suitable for classification.", - "FriendlyName": "Prepare Classification Label", + "Name": "Transforms.ColumnSelector", + "Desc": "Selects a set of columns, dropping all others", + "FriendlyName": "Select Columns", "ShortName": null, "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column name to keep", + "Aliases": [ + "col" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": null + }, { "Name": "Data", "Type": "DataView", @@ -15020,23 +14357,6 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false - }, - { - "Name": "LabelColumn", - "Type": "String", - "Desc": "The label column", - "Required": true, - "SortOrder": 2.0, - "IsNullable": false - }, - { - "Name": "TextKeyValues", - "Type": "Bool", - "Desc": "Convert the key values to text", - "Required": false, - "SortOrder": 3.0, - "IsNullable": false, - "Default": true } ], "Outputs": [ @@ -15059,10 +14379,10 @@ ] }, { - "Name": "Transforms.LabelIndicator", - "Desc": "Label remapper used by OVA", - "FriendlyName": "LabelIndicator", - "ShortName": "LabelIndictator", + "Name": "Transforms.ColumnTypeConverter", + "Desc": "Converts a column to a different type, using standard 
conversions.", + "FriendlyName": "Convert Transform", + "ShortName": "Convert", "Inputs": [ { "Name": "Column", @@ -15072,17 +14392,57 @@ "Kind": "Struct", "Fields": [ { - "Name": "ClassIndex", - "Type": "Int", - "Desc": "The positive example class for binary classification.", + "Name": "ResultType", + "Type": { + "Kind": "Enum", + "Values": [ + "I1", + "U1", + "I2", + "U2", + "I4", + "U4", + "I8", + "U8", + "R4", + "Num", + "R8", + "TX", + "Text", + "TXT", + "BL", + "Bool", + "TimeSpan", + "TS", + "DT", + "DateTime", + "DZ", + "DateTimeZone", + "UG", + "U16" + ] + }, + "Desc": "The result type", "Aliases": [ - "index" + "type" ], "Required": false, "SortOrder": 150.0, "IsNullable": true, "Default": null }, + { + "Name": "Range", + "Type": "String", + "Desc": "For a key column, this defines the range of values", + "Aliases": [ + "key" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, { "Name": "Name", "Type": "String", @@ -15110,14 +14470,13 @@ ] } }, - "Desc": "New column definition(s) (optional form: name:src)", + "Desc": "New column definition(s) (optional form: name:type:src)", "Aliases": [ "col" ], - "Required": false, + "Required": true, "SortOrder": 1.0, - "IsNullable": false, - "Default": null + "IsNullable": false }, { "Name": "Data", @@ -15128,16 +14487,56 @@ "IsNullable": false }, { - "Name": "ClassIndex", - "Type": "Int", - "Desc": "Label of the positive class.", + "Name": "ResultType", + "Type": { + "Kind": "Enum", + "Values": [ + "I1", + "U1", + "I2", + "U2", + "I4", + "U4", + "I8", + "U8", + "R4", + "Num", + "R8", + "TX", + "Text", + "TXT", + "BL", + "Bool", + "TimeSpan", + "TS", + "DT", + "DateTime", + "DZ", + "DateTimeZone", + "UG", + "U16" + ] + }, + "Desc": "The result type", "Aliases": [ - "index" + "type" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Range", + "Type": "String", + "Desc": "For a key column, this defines the range of values", + 
"Aliases": [ + "key" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": null } ], "Outputs": [ @@ -15160,11 +14559,26 @@ ] }, { - "Name": "Transforms.LabelToFloatConverter", - "Desc": "Transforms the label to float to make it suitable for regression.", - "FriendlyName": "Prepare Regression Label", - "ShortName": null, + "Name": "Transforms.CombinerByContiguousGroupId", + "Desc": "Groups values of a scalar column into a vector, by a contiguous group ID", + "FriendlyName": "Group Transform", + "ShortName": "Group", "Inputs": [ + { + "Name": "GroupKey", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Columns to group by", + "Aliases": [ + "g" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": null + }, { "Name": "Data", "Type": "DataView", @@ -15174,9 +14588,15 @@ "IsNullable": false }, { - "Name": "LabelColumn", - "Type": "String", - "Desc": "The label column", + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Columns to group together", + "Aliases": [ + "col" + ], "Required": true, "SortOrder": 2.0, "IsNullable": false @@ -15202,10 +14622,10 @@ ] }, { - "Name": "Transforms.LogMeanVarianceNormalizer", - "Desc": "Normalizes the data based on the computed mean and variance of the logarithm of the data.", - "FriendlyName": "LogMeanVar Normalizer", - "ShortName": "LogMeanVar", + "Name": "Transforms.ConditionalNormalizer", + "Desc": "Normalize the columns only if needed", + "FriendlyName": "Normalize If Needed", + "ShortName": null, "Inputs": [ { "Name": "Column", @@ -15214,6 +14634,18 @@ "ItemType": { "Kind": "Struct", "Fields": [ + { + "Name": "FixZero", + "Type": "Bool", + "Desc": "Whether to map zero to zero, preserving sparsity", + "Aliases": [ + "zero" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, { "Name": "MaxTrainingExamples", "Type": "Int", @@ -15257,10 +14689,9 @@ "Aliases": [ "col" 
], - "Required": false, + "Required": true, "SortOrder": 1.0, - "IsNullable": false, - "Default": null + "IsNullable": false }, { "Name": "Data", @@ -15271,11 +14702,11 @@ "IsNullable": false }, { - "Name": "UseCdf", + "Name": "FixZero", "Type": "Bool", - "Desc": "Whether to use CDF as the output", + "Desc": "Whether to map zero to zero, preserving sparsity", "Aliases": [ - "cdf" + "zero" ], "Required": false, "SortOrder": 150.0, @@ -15309,166 +14740,112 @@ ], "InputKind": [ "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" ] }, { - "Name": "Transforms.LpNormalizer", - "Desc": "Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D is either L2 norm, L1 norm or LInf norm.", - "FriendlyName": "Lp-Norm Normalizer", - "ShortName": "lpnorm", + "Name": "Transforms.DataCache", + "Desc": "Caches using the specified cache option.", + "FriendlyName": "Cache Data", + "ShortName": null, "Inputs": [ { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "NormKind", - "Type": { - "Kind": "Enum", - "Values": [ - "L2Norm", - "StdDev", - "L1Norm", - "LInf" - ] - }, - "Desc": "The norm to use to normalize each sample", - "Aliases": [ - "norm" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "SubMean", - "Type": "Bool", - "Desc": "Subtract mean from each value before normalizing", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - 
"Default": null - } - ] - } - }, - "Desc": "New column definition(s) (optional form: name:src)", - "Aliases": [ - "col" - ], + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "NormKind", + "Name": "Caching", "Type": { "Kind": "Enum", "Values": [ - "L2Norm", - "StdDev", - "L1Norm", - "LInf" + "Memory", + "Disk" ] }, - "Desc": "The norm to use to normalize each sample", - "Aliases": [ - "norm" - ], - "Required": false, - "SortOrder": 1.0, + "Desc": "Caching strategy", + "Required": true, + "SortOrder": 2.0, "IsNullable": false, - "Default": "L2Norm" - }, + "Default": "Memory" + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Dataset" + } + ], + "InputKind": [ + "ITransformInput" + ] + }, + { + "Name": "Transforms.DatasetScorer", + "Desc": "Score a dataset with a predictor model", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ { "Name": "Data", "Type": "DataView", - "Desc": "Input dataset", + "Desc": "The dataset to be scored", "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "SubMean", - "Type": "Bool", - "Desc": "Subtract mean from each value before normalizing", - "Required": false, + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The predictor model to apply to data", + "Required": true, "SortOrder": 2.0, + "IsNullable": false + }, + { + "Name": "Suffix", + "Type": "String", + "Desc": "Suffix to append to the score columns", + "Required": false, + "SortOrder": 3.0, "IsNullable": false, - "Default": false + "Default": null } ], "Outputs": [ { - "Name": "OutputData", + "Name": "ScoredData", "Type": "DataView", - "Desc": "Transformed dataset" + "Desc": "The scored dataset" }, { - "Name": "Model", + "Name": "ScoringTransform", "Type": "TransformModel", - "Desc": "Transform model" + "Desc": "The scoring transform" } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - 
"ITransformOutput" ] }, { - "Name": "Transforms.ManyHeterogeneousModelCombiner", - "Desc": "Combines a sequence of TransformModels and a PredictorModel into a single PredictorModel.", + "Name": "Transforms.DatasetTransformScorer", + "Desc": "Score a dataset with a transform model", "FriendlyName": null, "ShortName": null, "Inputs": [ { - "Name": "TransformModels", - "Type": { - "Kind": "Array", - "ItemType": "TransformModel" - }, - "Desc": "Transform model", + "Name": "Data", + "Type": "DataView", + "Desc": "The dataset to be scored", "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "Predictor model", + "Name": "TransformModel", + "Type": "TransformModel", + "Desc": "The transform model to apply to data", "Required": true, "SortOrder": 2.0, "IsNullable": false @@ -15476,17 +14853,22 @@ ], "Outputs": [ { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "Predictor model" + "Name": "ScoredData", + "Type": "DataView", + "Desc": "The scored dataset" + }, + { + "Name": "ScoringTransform", + "Type": "TransformModel", + "Desc": "The scoring transform" } ] }, { - "Name": "Transforms.MeanVarianceNormalizer", - "Desc": "Normalizes the data based on the computed mean and variance of the data.", - "FriendlyName": "MeanVar Normalizer", - "ShortName": "MeanVar", + "Name": "Transforms.Dictionarizer", + "Desc": "Converts input values (words, numbers, etc.) 
to index in a dictionary.", + "FriendlyName": "Term Transform", + "ShortName": "TermTransform", "Inputs": [ { "Name": "Column", @@ -15496,11 +14878,11 @@ "Kind": "Struct", "Fields": [ { - "Name": "FixZero", - "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", + "Name": "MaxNumTerms", + "Type": "Int", + "Desc": "Maximum number of terms to keep when auto-training", "Aliases": [ - "zero" + "max" ], "Required": false, "SortOrder": 150.0, @@ -15508,11 +14890,38 @@ "Default": null }, { - "Name": "MaxTrainingExamples", - "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", + "Name": "Term", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of terms", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Sort", + "Type": { + "Kind": "Enum", + "Values": [ + "Occurrence", + "Value" + ] + }, + "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "TextKeyValues", + "Type": "Bool", + "Desc": "Whether key value metadata should be text, regardless of the actual input type", "Aliases": [ - "maxtrain" + "textkv" ], "Required": false, "SortOrder": 150.0, @@ -15550,9 +14959,10 @@ "Aliases": [ "col" ], - "Required": true, + "Required": false, "SortOrder": 1.0, - "IsNullable": false + "IsNullable": false, + "Default": null }, { "Name": "Data", @@ -15563,40 +14973,161 @@ "IsNullable": false }, { - "Name": "UseCdf", - "Type": "Bool", - "Desc": "Whether to use CDF as the output", + "Name": "MaxNumTerms", + "Type": "Int", + "Desc": "Maximum number of terms to keep per column when auto-training", "Aliases": [ - "cdf" + "max" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": false + "Default": 1000000 }, { - "Name": "FixZero", - "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", - "Aliases": [ - "zero" + "Name": "Term", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of terms", + "Required": false, + "SortOrder": 106.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Sort", + "Type": { + "Kind": "Enum", + "Values": [ + "Occurrence", + "Value" + ] + }, + "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Required": false, + "SortOrder": 113.0, + "IsNullable": false, + "Default": "Occurrence" + }, + { + "Name": "TextKeyValues", + "Type": "Bool", + "Desc": "Whether key value metadata should be text, regardless of the actual input type", + "Aliases": [ + "textkv" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 114.0, "IsNullable": false, - "Default": true + "Default": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" }, { - "Name": "MaxTrainingExamples", + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.FeatureCombiner", + "Desc": "Combines all the features into one feature column.", + "FriendlyName": "Feature Combiner", + "ShortName": "fc", + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Features", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Features", + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": null + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.FeatureSelectorByCount", + "Desc": "Selects the slots for which the count of non-default values is greater than or equal to a threshold.", + "FriendlyName": "Count Feature Selection Transform", + "ShortName": null, + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": 
"Columns to use for feature selection", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Count", "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", + "Desc": "If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved", "Aliases": [ - "maxtrain" + "c" ], - "Required": false, - "SortOrder": 150.0, + "Required": true, + "SortOrder": 1.0, "IsNullable": false, - "Default": 1000000000 + "Default": 1 + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false } ], "Outputs": [ @@ -15619,70 +15150,18 @@ ] }, { - "Name": "Transforms.MinMaxNormalizer", - "Desc": "Normalizes the data based on the observed minimum and maximum values of the data.", - "FriendlyName": "Min-Max Normalizer", - "ShortName": "MinMax", + "Name": "Transforms.FeatureSelectorByMutualInformation", + "Desc": "Selects the top k slots across all specified columns ordered by their mutual information with the label column.", + "FriendlyName": "Mutual Information Feature Selection Transform", + "ShortName": "MIFeatureSelection", "Inputs": [ { "Name": "Column", "Type": { "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "FixZero", - "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", - "Aliases": [ - "zero" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "MaxTrainingExamples", - "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", - "Aliases": [ - "maxtrain" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - 
"Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } + "ItemType": "String" }, - "Desc": "New column definition(s) (optional form: name:src)", + "Desc": "Columns to use for feature selection", "Aliases": [ "col" ], @@ -15690,6 +15169,19 @@ "SortOrder": 1.0, "IsNullable": false }, + { + "Name": "SlotsInOutput", + "Type": "Int", + "Desc": "The maximum number of slots to preserve in output", + "Aliases": [ + "topk", + "numSlotsToKeep" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 1000 + }, { "Name": "Data", "Type": "DataView", @@ -15699,28 +15191,28 @@ "IsNullable": false }, { - "Name": "FixZero", - "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", "Aliases": [ - "zero" + "lab" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": true + "Default": "Label" }, { - "Name": "MaxTrainingExamples", + "Name": "NumBins", "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", + "Desc": "Max number of bins for R4/R8 columns, power of 2 recommended", "Aliases": [ - "maxtrain" + "bins" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1000000000 + "Default": 256 } ], "Outputs": [ @@ -15743,10 +15235,10 @@ ] }, { - "Name": "Transforms.MissingValueHandler", - "Desc": "Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). 
An indicator column can optionally be concatenated, if theinput column type is numeric.", - "FriendlyName": "NA Handle Transform", - "ShortName": "NAHandle", + "Name": "Transforms.GlobalContrastNormalizer", + "Desc": "Performs a global contrast normalization on input values: Y = (s * X - M) / D, where s is a scale, M is mean and D is either L2 norm or standard deviation.", + "FriendlyName": "Global Contrast Normalization Transform", + "ShortName": "Gcn", "Inputs": [ { "Name": "Column", @@ -15756,45 +15248,27 @@ "Kind": "Struct", "Fields": [ { - "Name": "Kind", - "Type": { - "Kind": "Enum", - "Values": [ - "Default", - "Def", - "DefaultValue", - "Mean", - "Minimum", - "Min", - "Maximum", - "Max" - ] - }, - "Desc": "The replacement method to utilize", + "Name": "UseStdDev", + "Type": "Bool", + "Desc": "Normalize by standard deviation rather than L2 norm", "Required": false, "SortOrder": 150.0, "IsNullable": true, "Default": null }, { - "Name": "ImputeBySlot", - "Type": "Bool", - "Desc": "Whether to impute values by slot", - "Aliases": [ - "slot" - ], + "Name": "Scale", + "Type": "Float", + "Desc": "Scale features by this value", "Required": false, "SortOrder": 150.0, "IsNullable": true, "Default": null }, { - "Name": "ConcatIndicator", + "Name": "SubMean", "Type": "Bool", - "Desc": "Whether or not to concatenate an indicator vector column to the value column", - "Aliases": [ - "ind" - ], + "Desc": "Subtract mean from each value before normalizing", "Required": false, "SortOrder": 150.0, "IsNullable": true, @@ -15827,13 +15301,23 @@ ] } }, - "Desc": "New column definition(s) (optional form: name:rep:src)", + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ "col" ], - "Required": true, + "Required": false, "SortOrder": 1.0, - "IsNullable": false + "IsNullable": false, + "Default": null + }, + { + "Name": "SubMean", + "Type": "Bool", + "Desc": "Subtract mean from each value before normalizing", + "Required": false, + "SortOrder": 1.0, + 
"IsNullable": false, + "Default": true }, { "Name": "Data", @@ -15844,52 +15328,25 @@ "IsNullable": false }, { - "Name": "ReplaceWith", - "Type": { - "Kind": "Enum", - "Values": [ - "Default", - "Def", - "DefaultValue", - "Mean", - "Minimum", - "Min", - "Maximum", - "Max" - ] - }, - "Desc": "The replacement method to utilize", - "Aliases": [ - "kind" - ], - "Required": false, - "SortOrder": 2.0, - "IsNullable": false, - "Default": "Def" - }, - { - "Name": "ImputeBySlot", + "Name": "UseStdDev", "Type": "Bool", - "Desc": "Whether to impute values by slot", + "Desc": "Normalize by standard deviation rather than L2 norm", "Aliases": [ - "slot" + "useStd" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": false }, { - "Name": "Concat", - "Type": "Bool", - "Desc": "Whether or not to concatenate an indicator vector column to the value column", - "Aliases": [ - "ind" - ], + "Name": "Scale", + "Type": "Float", + "Desc": "Scale features by this value", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": 1.0 } ], "Outputs": [ @@ -15912,10 +15369,10 @@ ] }, { - "Name": "Transforms.MissingValueIndicator", - "Desc": "Create a boolean output column with the same number of slots as the input column, where the output value is true if the value in the input column is missing.", - "FriendlyName": "NA Indicator Transform", - "ShortName": "NAInd", + "Name": "Transforms.HashConverter", + "Desc": "Converts column values into hashes. This transform accepts both numeric and text inputs, both single and vector-valued columns. 
This is a part of the Dracula transform.", + "FriendlyName": "Hash Join Transform", + "ShortName": "HashJoin", "Inputs": [ { "Name": "Column", @@ -15924,6 +15381,57 @@ "ItemType": { "Kind": "Struct", "Fields": [ + { + "Name": "Join", + "Type": "Bool", + "Desc": "Whether the values need to be combined for a single hash", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "CustomSlotMap", + "Type": "String", + "Desc": "Which slots should be combined together. Example: 0,3,5;0,1;3;2,1,0. Overrides 'join'.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "HashBits", + "Type": "Int", + "Desc": "Number of bits to hash into. Must be between 1 and 31, inclusive.", + "Aliases": [ + "bits" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Seed", + "Type": "UInt", + "Desc": "Hashing seed", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Ordered", + "Type": "Bool", + "Desc": "Whether the position of each term should be included in the hash", + "Aliases": [ + "ord" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, { "Name": "Name", "Type": "String", @@ -15966,6 +15474,48 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false + }, + { + "Name": "HashBits", + "Type": "Int", + "Desc": "Number of bits to hash into. 
Must be between 1 and 31, inclusive.", + "Aliases": [ + "bits" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": 31 + }, + { + "Name": "Join", + "Type": "Bool", + "Desc": "Whether the values need to be combined for a single hash", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "Seed", + "Type": "UInt", + "Desc": "Hashing seed", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 314489979 + }, + { + "Name": "Ordered", + "Type": "Bool", + "Desc": "Whether the position of each term should be included in the hash", + "Aliases": [ + "ord" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true } ], "Outputs": [ @@ -15988,10 +15538,10 @@ ] }, { - "Name": "Transforms.MissingValuesDropper", - "Desc": "Removes NAs from vector columns.", - "FriendlyName": "NA Drop Transform", - "ShortName": "NADrop", + "Name": "Transforms.KeyToTextConverter", + "Desc": "KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the KeyValues metadata.", + "FriendlyName": "Key To Value Transform", + "ShortName": null, "Inputs": [ { "Name": "Column", @@ -16027,7 +15577,7 @@ ] } }, - "Desc": "Columns to drop the NAs for", + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ "col" ], @@ -16064,41 +15614,35 @@ ] }, { - "Name": "Transforms.MissingValuesRowDropper", - "Desc": "Filters out rows that contain missing values.", - "FriendlyName": "NA Filter", - "ShortName": "NAFilter", + "Name": "Transforms.LabelColumnKeyBooleanConverter", + "Desc": "Transforms the label to either key or bool (if needed) to make it suitable for classification.", + "FriendlyName": "Prepare Classification Label", + "ShortName": null, "Inputs": [ { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Column", - "Aliases": [ - "col" - ], + "Name": "Data", + "Type": "DataView", + 
"Desc": "Input dataset", "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", + "Name": "LabelColumn", + "Type": "String", + "Desc": "The label column", "Required": true, - "SortOrder": 1.0, + "SortOrder": 2.0, "IsNullable": false }, { - "Name": "Complement", + "Name": "TextKeyValues", "Type": "Bool", - "Desc": "If true, keep only rows that contain NA values, and filter the rest.", + "Desc": "Convert the key values to text", "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": false + "Default": true } ], "Outputs": [ @@ -16121,10 +15665,10 @@ ] }, { - "Name": "Transforms.MissingValueSubstitutor", - "Desc": "Create an output column of the same type and size of the input column, where missing values are replaced with either the default value or the mean/min/max value (for non-text columns only).", - "FriendlyName": "NA Replace Transform", - "ShortName": "NARep", + "Name": "Transforms.LabelIndicator", + "Desc": "Label remapper used by OVA", + "FriendlyName": "LabelIndicator", + "ShortName": "LabelIndictator", "Inputs": [ { "Name": "Column", @@ -16134,48 +15678,15 @@ "Kind": "Struct", "Fields": [ { - "Name": "ReplacementString", - "Type": "String", - "Desc": "Replacement value for NAs (uses default value if not given)", + "Name": "ClassIndex", + "Type": "Int", + "Desc": "The positive example class for binary classification.", "Aliases": [ - "rep" + "index" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Kind", - "Type": { - "Kind": "Enum", - "Values": [ - "Default", - "DefaultValue", - "Def", - "Mean", - "Min", - "Minimum", - "Max", - "Maximum", - "SpecifiedValue", - "Val", - "Value" - ] - }, - "Desc": "The replacement method to utilize", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Slot", - "Type": "Bool", - "Desc": "Whether to impute values 
by slot", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, + "IsNullable": true, "Default": null }, { @@ -16205,13 +15716,14 @@ ] } }, - "Desc": "New column definition(s) (optional form: name:rep:src)", + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ "col" ], - "Required": true, + "Required": false, "SortOrder": 1.0, - "IsNullable": false + "IsNullable": false, + "Default": null }, { "Name": "Data", @@ -16222,43 +15734,16 @@ "IsNullable": false }, { - "Name": "ReplacementKind", - "Type": { - "Kind": "Enum", - "Values": [ - "Default", - "DefaultValue", - "Def", - "Mean", - "Min", - "Minimum", - "Max", - "Maximum", - "SpecifiedValue", - "Val", - "Value" - ] - }, - "Desc": "The replacement method to utilize", - "Aliases": [ - "kind" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "Def" - }, - { - "Name": "ImputeBySlot", - "Type": "Bool", - "Desc": "Whether to impute values by slot", + "Name": "ClassIndex", + "Type": "Int", + "Desc": "Label of the positive class.", "Aliases": [ - "slot" + "index" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": 0 } ], "Outputs": [ @@ -16281,37 +15766,52 @@ ] }, { - "Name": "Transforms.ModelCombiner", - "Desc": "Combines a sequence of TransformModels into a single model", - "FriendlyName": null, + "Name": "Transforms.LabelToFloatConverter", + "Desc": "Transforms the label to float to make it suitable for regression.", + "FriendlyName": "Prepare Regression Label", "ShortName": null, "Inputs": [ { - "Name": "Models", - "Type": { - "Kind": "Array", - "ItemType": "TransformModel" - }, - "Desc": "Input models", - "Required": false, + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, "SortOrder": 1.0, - "IsNullable": false, - "Default": null + "IsNullable": false + }, + { + "Name": "LabelColumn", + "Type": "String", + "Desc": "The label column", + "Required": true, + "SortOrder": 2.0, 
+ "IsNullable": false } ], "Outputs": [ { - "Name": "OutputModel", + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", "Type": "TransformModel", - "Desc": "Combined model" + "Desc": "Transform model" } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" ] }, { - "Name": "Transforms.NGramTranslator", - "Desc": "Produces a bag of counts of ngrams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.", - "FriendlyName": "NGram Transform", - "ShortName": "NgramTransform", + "Name": "Transforms.LogMeanVarianceNormalizer", + "Desc": "Normalizes the data based on the computed mean and variance of the logarithm of the data.", + "FriendlyName": "LogMeanVar Normalizer", + "ShortName": "LogMeanVar", "Inputs": [ { "Name": "Column", @@ -16321,69 +15821,14 @@ "Kind": "Struct", "Fields": [ { - "Name": "NgramLength", - "Type": "Int", - "Desc": "Maximum ngram length", - "Aliases": [ - "ngram" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "AllLengths", - "Type": "Bool", - "Desc": "Whether to include all ngram lengths up to NgramLength or only NgramLength", - "Aliases": [ - "all" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "SkipLength", + "Name": "MaxTrainingExamples", "Type": "Int", - "Desc": "Maximum number of tokens to skip when constructing an ngram", - "Aliases": [ - "skips" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "MaxNumTerms", - "Type": { - "Kind": "Array", - "ItemType": "Int" - }, - "Desc": "Maximum number of ngrams to store in the dictionary", + "Desc": "Max number of examples used to train the normalizer", "Aliases": [ - "max" + "maxtrain" ], "Required": false, "SortOrder": 
150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Weighting", - "Type": { - "Kind": "Enum", - "Values": [ - "Tf", - "Idf", - "TfIdf" - ] - }, - "Desc": "Statistical measure used to evaluate how important a word is to a document in a corpus", - "Required": false, - "SortOrder": 150.0, "IsNullable": true, "Default": null }, @@ -16432,23 +15877,11 @@ "IsNullable": false }, { - "Name": "NgramLength", - "Type": "Int", - "Desc": "Maximum ngram length", - "Aliases": [ - "ngram" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 2 - }, - { - "Name": "AllLengths", + "Name": "UseCdf", "Type": "Bool", - "Desc": "Whether to store all ngram lengths up to ngramLength, or only ngramLength", + "Desc": "Whether to use CDF as the output", "Aliases": [ - "all" + "cdf" ], "Required": false, "SortOrder": 150.0, @@ -16456,49 +15889,16 @@ "Default": true }, { - "Name": "SkipLength", + "Name": "MaxTrainingExamples", "Type": "Int", - "Desc": "Maximum number of tokens to skip when constructing an ngram", - "Aliases": [ - "skips" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0 - }, - { - "Name": "MaxNumTerms", - "Type": { - "Kind": "Array", - "ItemType": "Int" - }, - "Desc": "Maximum number of ngrams to store in the dictionary", + "Desc": "Max number of examples used to train the normalizer", "Aliases": [ - "max" + "maxtrain" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": [ - 10000000 - ] - }, - { - "Name": "Weighting", - "Type": { - "Kind": "Enum", - "Values": [ - "Tf", - "Idf", - "TfIdf" - ] - }, - "Desc": "The weighting criteria", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "Tf" + "Default": 1000000000 } ], "Outputs": [ @@ -16521,19 +15921,119 @@ ] }, { - "Name": "Transforms.NoOperation", - "Desc": "Does nothing.", - "FriendlyName": "No Op", - "ShortName": "Nop", + "Name": "Transforms.LpNormalizer", + "Desc": "Normalize vectors (rows) 
individually by rescaling them to unit norm (L2, L1 or LInf). Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D is either L2 norm, L1 norm or LInf norm.", + "FriendlyName": "Lp-Norm Normalizer", + "ShortName": "lpnorm", "Inputs": [ { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - } + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "NormKind", + "Type": { + "Kind": "Enum", + "Values": [ + "L2Norm", + "StdDev", + "L1Norm", + "LInf" + ] + }, + "Desc": "The norm to use to normalize each sample", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "SubMean", + "Type": "Bool", + "Desc": "Subtract mean from each value before normalizing", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "NormKind", + "Type": { + "Kind": "Enum", + "Values": [ + "L2Norm", + "StdDev", + "L1Norm", + "LInf" + ] + }, + "Desc": "The norm to use to normalize each sample", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": "L2Norm" + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + 
"Name": "SubMean", + "Type": "Bool", + "Desc": "Subtract mean from each value before normalizing", + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": false + } ], "Outputs": [ { @@ -16555,58 +16055,44 @@ ] }, { - "Name": "Transforms.OptionalColumnCreator", - "Desc": "If the source column does not exist after deserialization, create a column with the right type and default values.", - "FriendlyName": "Optional Column Transform", - "ShortName": "optional", + "Name": "Transforms.ManyHeterogeneousModelCombiner", + "Desc": "Combines a sequence of TransformModels and a PredictorModel into a single PredictorModel.", + "FriendlyName": null, + "ShortName": null, "Inputs": [ { - "Name": "Column", + "Name": "TransformModels", "Type": { "Kind": "Array", - "ItemType": "String" + "ItemType": "TransformModel" }, - "Desc": "New column definition(s)", - "Aliases": [ - "col" - ], + "Desc": "Transform model", "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "Predictor model", "Required": true, - "SortOrder": 1.0, + "SortOrder": 2.0, "IsNullable": false } ], "Outputs": [ { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "Predictor model" } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" ] }, { - "Name": "Transforms.PcaCalculator", - "Desc": "Train an PCA Anomaly model.", - "FriendlyName": "Principal Component Analysis Transform", - "ShortName": "Pca", + "Name": "Transforms.MeanVarianceNormalizer", + "Desc": "Normalizes the data based on the computed mean and variance of the data.", + "FriendlyName": "MeanVar Normalizer", + "ShortName": "MeanVar", "Inputs": [ { "Name": "Column", @@ -16616,47 +16102,11 @@ 
"Kind": "Struct", "Fields": [ { - "Name": "WeightColumn", - "Type": "String", - "Desc": "The name of the weight column", - "Aliases": [ - "weight" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Rank", - "Type": "Int", - "Desc": "The number of components in the PCA", - "Aliases": [ - "k" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Oversampling", - "Type": "Int", - "Desc": "Oversampling parameter for randomized PCA training", - "Aliases": [ - "over" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Center", + "Name": "FixZero", "Type": "Bool", - "Desc": "If enabled, data is centered to be zero mean", + "Desc": "Whether to map zero to zero, preserving sparsity", "Aliases": [ - "center" + "zero" ], "Required": false, "SortOrder": 150.0, @@ -16664,11 +16114,11 @@ "Default": null }, { - "Name": "Seed", + "Name": "MaxTrainingExamples", "Type": "Int", - "Desc": "The seed for random number generation", + "Desc": "Max number of examples used to train the normalizer", "Aliases": [ - "seed" + "maxtrain" ], "Required": false, "SortOrder": 150.0, @@ -16719,58 +16169,40 @@ "IsNullable": false }, { - "Name": "WeightColumn", - "Type": "String", - "Desc": "The name of the weight column", + "Name": "UseCdf", + "Type": "Bool", + "Desc": "Whether to use CDF as the output", "Aliases": [ - "weight" + "cdf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": false }, { - "Name": "Rank", - "Type": "Int", - "Desc": "The number of components in the PCA", + "Name": "FixZero", + "Type": "Bool", + "Desc": "Whether to map zero to zero, preserving sparsity", "Aliases": [ - "k" + "zero" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 20 + "Default": true }, { - "Name": "Oversampling", + "Name": "MaxTrainingExamples", "Type": "Int", - "Desc": 
"Oversampling parameter for randomized PCA training", + "Desc": "Max number of examples used to train the normalizer", "Aliases": [ - "over" + "maxtrain" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 20 - }, - { - "Name": "Center", - "Type": "Bool", - "Desc": "If enabled, data is centered to be zero mean", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "Seed", - "Type": "Int", - "Desc": "The seed for random number generation", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0 + "Default": 1000000000 } ], "Outputs": [ @@ -16793,52 +16225,10 @@ ] }, { - "Name": "Transforms.PredictedLabelColumnOriginalValueConverter", - "Desc": "Transforms a predicted label column to its original values, unless it is of type bool.", - "FriendlyName": "Convert Predicted Label", - "ShortName": null, - "Inputs": [ - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "PredictedLabelColumn", - "Type": "String", - "Desc": "The predicted label column", - "Required": true, - "SortOrder": 2.0, - "IsNullable": false - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.RandomNumberGenerator", - "Desc": "Adds a column with a generated number sequence.", - "FriendlyName": "Generate Number Transform", - "ShortName": "Generate", + "Name": "Transforms.MinMaxNormalizer", + "Desc": "Normalizes the data based on the observed minimum and maximum values of the data.", + "FriendlyName": "Min-Max Normalizer", + "ShortName": "MinMax", "Inputs": [ { "Name": "Column", @@ -16848,23 +16238,23 @@ "Kind": "Struct", "Fields": [ { - "Name": 
"Name", - "Type": "String", - "Desc": "Name of the new column", + "Name": "FixZero", + "Type": "Bool", + "Desc": "Whether to map zero to zero, preserving sparsity", "Aliases": [ - "name" + "zero" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, + "IsNullable": true, "Default": null }, { - "Name": "UseCounter", - "Type": "Bool", - "Desc": "Use an auto-incremented integer starting at zero instead of a random number", + "Name": "MaxTrainingExamples", + "Type": "Int", + "Desc": "Max number of examples used to train the normalizer", "Aliases": [ - "cnt" + "maxtrain" ], "Required": false, "SortOrder": 150.0, @@ -16872,18 +16262,33 @@ "Default": null }, { - "Name": "Seed", - "Type": "UInt", - "Desc": "The random seed", + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, "Default": null } ] } }, - "Desc": "New column definition(s) (optional form: name:seed)", + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ "col" ], @@ -16900,25 +16305,28 @@ "IsNullable": false }, { - "Name": "UseCounter", + "Name": "FixZero", "Type": "Bool", - "Desc": "Use an auto-incremented integer starting at zero instead of a random number", + "Desc": "Whether to map zero to zero, preserving sparsity", "Aliases": [ - "cnt" + "zero" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": true }, { - "Name": "Seed", - "Type": "UInt", - "Desc": "The random seed", + "Name": "MaxTrainingExamples", + "Type": "Int", + "Desc": "Max number of examples used to train the normalizer", + "Aliases": [ + "maxtrain" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 42 + 
"Default": 1000000000 } ], "Outputs": [ @@ -16941,15 +16349,91 @@ ] }, { - "Name": "Transforms.RowRangeFilter", - "Desc": "Filters a dataview on a column of type Single, Double or Key (contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. If the input is a Key type, the min/max are considered percentages of the number of values.", - "FriendlyName": "Range Filter", - "ShortName": "RangeFilter", + "Name": "Transforms.MissingValueHandler", + "Desc": "Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). An indicator column can optionally be concatenated, if theinput column type is numeric.", + "FriendlyName": "NA Handle Transform", + "ShortName": "NAHandle", "Inputs": [ { "Name": "Column", - "Type": "String", - "Desc": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Kind", + "Type": { + "Kind": "Enum", + "Values": [ + "Default", + "Def", + "DefaultValue", + "Mean", + "Minimum", + "Min", + "Maximum", + "Max" + ] + }, + "Desc": "The replacement method to utilize", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "ImputeBySlot", + "Type": "Bool", + "Desc": "Whether to impute values by slot", + "Aliases": [ + "slot" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "ConcatIndicator", + "Type": "Bool", + "Desc": "Whether or not to concatenate an indicator vector column to the value column", + "Aliases": [ + "ind" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ 
+ "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:rep:src)", "Aliases": [ "col" ], @@ -16966,49 +16450,52 @@ "IsNullable": false }, { - "Name": "Min", - "Type": "Float", - "Desc": "Minimum value (0 to 1 for key types)", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Max", - "Type": "Float", - "Desc": "Maximum value (0 to 1 for key types)", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Complement", - "Type": "Bool", - "Desc": "If true, keep the values that fall outside the range.", + "Name": "ReplaceWith", + "Type": { + "Kind": "Enum", + "Values": [ + "Default", + "Def", + "DefaultValue", + "Mean", + "Minimum", + "Min", + "Maximum", + "Max" + ] + }, + "Desc": "The replacement method to utilize", + "Aliases": [ + "kind" + ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": false + "Default": "Def" }, { - "Name": "IncludeMin", + "Name": "ImputeBySlot", "Type": "Bool", - "Desc": "If true, include in the range the values that are equal to min.", + "Desc": "Whether to impute values by slot", + "Aliases": [ + "slot" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, "Default": true }, { - "Name": "IncludeMax", + "Name": "Concat", "Type": "Bool", - "Desc": "If true, include in the range the values that are equal to max.", + "Desc": "Whether or not to concatenate an indicator vector column to the value column", + "Aliases": [ + "ind" + ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": true } ], "Outputs": [ @@ -17031,22 +16518,52 @@ ] }, { - "Name": "Transforms.RowSkipAndTakeFilter", - "Desc": "Allows limiting input to a subset of rows at an optional offset. 
Can be used to implement data paging.", - "FriendlyName": "Skip and Take Filter", - "ShortName": "SkipTake", + "Name": "Transforms.MissingValueIndicator", + "Desc": "Create a boolean output column with the same number of slots as the input column, where the output value is true if the value in the input column is missing.", + "FriendlyName": "NA Indicator Transform", + "ShortName": "NAInd", "Inputs": [ { - "Name": "Skip", - "Type": "Int", - "Desc": "Number of items to skip", + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ - "s" + "col" ], - "Required": false, + "Required": true, "SortOrder": 1.0, - "IsNullable": true, - "Default": null + "IsNullable": false }, { "Name": "Data", @@ -17055,18 +16572,6 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false - }, - { - "Name": "Take", - "Type": "Int", - "Desc": "Number of items to take", - "Aliases": [ - "t" - ], - "Required": false, - "SortOrder": 2.0, - "IsNullable": true, - "Default": null } ], "Outputs": [ @@ -17089,24 +16594,52 @@ ] }, { - "Name": "Transforms.RowSkipFilter", - "Desc": "Allows limiting input to a subset of rows by skipping a number of rows.", - "FriendlyName": "Skip Filter", - "ShortName": "Skip", + "Name": "Transforms.MissingValuesDropper", + "Desc": "Removes NAs from vector columns.", + "FriendlyName": "NA Drop Transform", + "ShortName": "NADrop", "Inputs": [ { - "Name": "Count", - "Type": "Int", - "Desc": "Number of items to skip", + "Name": "Column", + "Type": { + 
"Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "Columns to drop the NAs for", "Aliases": [ - "c", - "n", - "s" + "col" ], "Required": true, "SortOrder": 1.0, - "IsNullable": false, - "Default": 0 + "IsNullable": false }, { "Name": "Data", @@ -17137,24 +16670,24 @@ ] }, { - "Name": "Transforms.RowTakeFilter", - "Desc": "Allows limiting input to a subset of rows by taking N first rows.", - "FriendlyName": "Take Filter", - "ShortName": "Take", + "Name": "Transforms.MissingValuesRowDropper", + "Desc": "Filters out rows that contain missing values.", + "FriendlyName": "NA Filter", + "ShortName": "NAFilter", "Inputs": [ { - "Name": "Count", - "Type": "Int", - "Desc": "Number of items to take", + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Column", "Aliases": [ - "c", - "n", - "t" + "col" ], "Required": true, "SortOrder": 1.0, - "IsNullable": false, - "Default": 9223372036854775807 + "IsNullable": false }, { "Name": "Data", @@ -17163,6 +16696,15 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false + }, + { + "Name": "Complement", + "Type": "Bool", + "Desc": "If true, keep only rows that contain NA values, and filter the rest.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false } ], "Outputs": [ @@ -17185,11 +16727,98 @@ ] }, { - "Name": "Transforms.ScoreColumnSelector", - "Desc": "Selects only the last score columns and the extra columns specified in the arguments.", - "FriendlyName": "Choose Columns By Index", - "ShortName": null, + "Name": 
"Transforms.MissingValueSubstitutor", + "Desc": "Create an output column of the same type and size of the input column, where missing values are replaced with either the default value or the mean/min/max value (for non-text columns only).", + "FriendlyName": "NA Replace Transform", + "ShortName": "NARep", "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "ReplacementString", + "Type": "String", + "Desc": "Replacement value for NAs (uses default value if not given)", + "Aliases": [ + "rep" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Kind", + "Type": { + "Kind": "Enum", + "Values": [ + "Default", + "DefaultValue", + "Def", + "Mean", + "Min", + "Minimum", + "Max", + "Maximum", + "SpecifiedValue", + "Val", + "Value" + ] + }, + "Desc": "The replacement method to utilize", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Slot", + "Type": "Bool", + "Desc": "Whether to impute values by slot", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:rep:src)", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, { "Name": "Data", "Type": "DataView", @@ -17199,108 +16828,43 @@ "IsNullable": false }, { - "Name": "ExtraColumns", + "Name": "ReplacementKind", "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Extra columns to write", - 
"Required": false, - "SortOrder": 2.0, - "IsNullable": false, - "Default": null - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.Scorer", - "Desc": "Turn the predictor model into a transform model", - "FriendlyName": null, - "ShortName": null, - "Inputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The predictor model to turn into a transform", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - } - ], - "Outputs": [ - { - "Name": "ScoredData", - "Type": "DataView", - "Desc": "The scored dataset" - }, - { - "Name": "ScoringTransform", - "Type": "TransformModel", - "Desc": "The scoring transform" - } - ] - }, - { - "Name": "Transforms.Segregator", - "Desc": "Un-groups vector columns into sequences of rows, inverse of Group transform", - "FriendlyName": "Un-group Transform", - "ShortName": "Ungroup", - "Inputs": [ - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": "String" + "Kind": "Enum", + "Values": [ + "Default", + "DefaultValue", + "Def", + "Mean", + "Min", + "Minimum", + "Max", + "Maximum", + "SpecifiedValue", + "Val", + "Value" + ] }, - "Desc": "Columns to unroll, or 'pivot'", + "Desc": "The replacement method to utilize", "Aliases": [ - "col" + "kind" ], - "Required": true, + "Required": false, "SortOrder": 150.0, - "IsNullable": false + "IsNullable": false, + "Default": "Def" }, { - "Name": "Mode", - "Type": { - "Kind": "Enum", - "Values": [ - "Inner", - "Outer", - "First" - ] - }, - "Desc": "Specifies how to unroll multiple pivot columns of different size.", + "Name": "ImputeBySlot", + "Type": "Bool", + 
"Desc": "Whether to impute values by slot", + "Aliases": [ + "slot" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "Inner" + "Default": true } ], "Outputs": [ @@ -17323,67 +16887,37 @@ ] }, { - "Name": "Transforms.SentimentAnalyzer", - "Desc": "Uses a pretrained sentiment model to score input strings", - "FriendlyName": "Sentiment Analyzing Transform", - "ShortName": "Senti", + "Name": "Transforms.ModelCombiner", + "Desc": "Combines a sequence of TransformModels into a single model", + "FriendlyName": null, + "ShortName": null, "Inputs": [ { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column.", - "Aliases": [ - "col" - ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column.", - "Aliases": [ - "dst" - ], + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "TransformModel" + }, + "Desc": "Input models", "Required": false, - "SortOrder": 2.0, + "SortOrder": 1.0, "IsNullable": false, "Default": null } ], "Outputs": [ { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", + "Name": "OutputModel", "Type": "TransformModel", - "Desc": "Transform model" + "Desc": "Combined model" } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" ] }, { - "Name": "Transforms.SupervisedBinNormalizer", - "Desc": "Similar to BinNormalizer, but calculates bins based on correlation with the label column, not equi-density. The new value is bin_number / number_of_bins.", - "FriendlyName": "Supervised Binning Normalizer", - "ShortName": "SupBin", + "Name": "Transforms.NGramTranslator", + "Desc": "Produces a bag of counts of ngrams (sequences of consecutive values of length 1-n) in a given vector of keys. 
It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.", + "FriendlyName": "NGram Transform", + "ShortName": "NgramTransform", "Inputs": [ { "Name": "Column", @@ -17393,11 +16927,11 @@ "Kind": "Struct", "Fields": [ { - "Name": "NumBins", + "Name": "NgramLength", "Type": "Int", - "Desc": "Max number of bins, power of 2 recommended", + "Desc": "Maximum ngram length", "Aliases": [ - "bins" + "ngram" ], "Required": false, "SortOrder": 150.0, @@ -17405,11 +16939,11 @@ "Default": null }, { - "Name": "FixZero", + "Name": "AllLengths", "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", + "Desc": "Whether to include all ngram lengths up to NgramLength or only NgramLength", "Aliases": [ - "zero" + "all" ], "Required": false, "SortOrder": 150.0, @@ -17417,14 +16951,45 @@ "Default": null }, { - "Name": "MaxTrainingExamples", + "Name": "SkipLength", "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", + "Desc": "Maximum number of tokens to skip when constructing an ngram", "Aliases": [ - "maxtrain" + "skips" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "MaxNumTerms", + "Type": { + "Kind": "Array", + "ItemType": "Int" + }, + "Desc": "Maximum number of ngrams to store in the dictionary", + "Aliases": [ + "max" ], "Required": false, "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Weighting", + "Type": { + "Kind": "Enum", + "Values": [ + "Tf", + "Idf", + "TfIdf" + ] + }, + "Desc": "Statistical measure used to evaluate how important a word is to a document in a corpus", + "Required": false, + "SortOrder": 150.0, "IsNullable": true, "Default": null }, @@ -17473,61 +17038,73 @@ "IsNullable": false }, { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Label column for supervised binning", + "Name": "NgramLength", + "Type": "Int", + "Desc": "Maximum ngram length", "Aliases": [ - 
"label", - "lab" + "ngram" ], - "Required": true, + "Required": false, "SortOrder": 150.0, - "IsNullable": false + "IsNullable": false, + "Default": 2 }, { - "Name": "MinBinSize", - "Type": "Int", - "Desc": "Minimum number of examples per bin", + "Name": "AllLengths", + "Type": "Bool", + "Desc": "Whether to store all ngram lengths up to ngramLength, or only ngramLength", + "Aliases": [ + "all" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 10 + "Default": true }, { - "Name": "NumBins", + "Name": "SkipLength", "Type": "Int", - "Desc": "Max number of bins, power of 2 recommended", + "Desc": "Maximum number of tokens to skip when constructing an ngram", "Aliases": [ - "bins" + "skips" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1024 + "Default": 0 }, { - "Name": "FixZero", - "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", + "Name": "MaxNumTerms", + "Type": { + "Kind": "Array", + "ItemType": "Int" + }, + "Desc": "Maximum number of ngrams to store in the dictionary", "Aliases": [ - "zero" + "max" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": [ + 10000000 + ] }, { - "Name": "MaxTrainingExamples", - "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", - "Aliases": [ - "maxtrain" - ], + "Name": "Weighting", + "Type": { + "Kind": "Enum", + "Values": [ + "Tf", + "Idf", + "TfIdf" + ] + }, + "Desc": "The weighting criteria", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1000000000 + "Default": "Tf" } ], "Outputs": [ @@ -17550,46 +17127,52 @@ ] }, { - "Name": "Transforms.TextFeaturizer", - "Desc": "A transform that turns a collection of text documents into numerical feature vectors. 
The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text.", - "FriendlyName": "Text Transform", - "ShortName": "Text", + "Name": "Transforms.NoOperation", + "Desc": "Does nothing.", + "FriendlyName": "No Op", + "ShortName": "Nop", + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.OptionalColumnCreator", + "Desc": "If the source column does not exist after deserialization, create a column with the right type and default values.", + "FriendlyName": "Optional Column Transform", + "ShortName": "optional", "Inputs": [ { "Name": "Column", "Type": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] + "Kind": "Array", + "ItemType": "String" }, - "Desc": "New column definition (optional form: name:srcs).", + "Desc": "New column definition(s)", "Aliases": [ "col" ], @@ -17604,234 +17187,238 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" }, { - "Name": "Language", + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + 
"OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.PcaCalculator", + "Desc": "Train an PCA Anomaly model.", + "FriendlyName": "Principal Component Analysis Transform", + "ShortName": "Pca", + "Inputs": [ + { + "Name": "Column", "Type": { - "Kind": "Enum", - "Values": [ - "English", - "French", - "German", - "Dutch", - "Italian", - "Spanish", - "Japanese" - ] + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "WeightColumn", + "Type": "String", + "Desc": "The name of the weight column", + "Aliases": [ + "weight" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Rank", + "Type": "Int", + "Desc": "The number of components in the PCA", + "Aliases": [ + "k" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Oversampling", + "Type": "Int", + "Desc": "Oversampling parameter for randomized PCA training", + "Aliases": [ + "over" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Center", + "Type": "Bool", + "Desc": "If enabled, data is centered to be zero mean", + "Aliases": [ + "center" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Seed", + "Type": "Int", + "Desc": "The seed for random number generation", + "Aliases": [ + "seed" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } }, - "Desc": "Dataset language or 'AutoDetect' to detect language per row.", 
+ "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ - "lang" + "col" ], - "Required": false, - "SortOrder": 3.0, - "IsNullable": false, - "Default": "English" + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "StopWordsRemover", - "Type": { - "Kind": "Component", - "ComponentKind": "StopWordsRemover" - }, - "Desc": "Stopwords remover.", - "Aliases": [ - "remover" - ], - "Required": false, - "SortOrder": 4.0, - "IsNullable": false, - "Default": null + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "TextCase", - "Type": { - "Kind": "Enum", - "Values": [ - "Lower", - "Upper", - "None" - ] - }, - "Desc": "Casing text using the rules of the invariant culture.", + "Name": "WeightColumn", + "Type": "String", + "Desc": "The name of the weight column", "Aliases": [ - "case" + "weight" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Lower" + "Default": null }, { - "Name": "KeepDiacritics", - "Type": "Bool", - "Desc": "Whether to keep diacritical marks or remove them.", + "Name": "Rank", + "Type": "Int", + "Desc": "The number of components in the PCA", "Aliases": [ - "diac" + "k" ], "Required": false, - "SortOrder": 6.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 20 }, { - "Name": "KeepPunctuations", - "Type": "Bool", - "Desc": "Whether to keep punctuation marks or remove them.", + "Name": "Oversampling", + "Type": "Int", + "Desc": "Oversampling parameter for randomized PCA training", "Aliases": [ - "punc" + "over" ], "Required": false, - "SortOrder": 7.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": 20 }, { - "Name": "KeepNumbers", + "Name": "Center", "Type": "Bool", - "Desc": "Whether to keep numbers or remove them.", - "Aliases": [ - "num" - ], + "Desc": "If enabled, data is centered to be zero mean", "Required": 
false, - "SortOrder": 8.0, + "SortOrder": 150.0, "IsNullable": false, "Default": true }, { - "Name": "OutputTokens", - "Type": "Bool", - "Desc": "Whether to output the transformed text tokens as an additional column.", - "Aliases": [ - "tokens", - "showtext", - "showTransformedText" - ], + "Name": "Seed", + "Type": "Int", + "Desc": "The seed for random number generation", "Required": false, - "SortOrder": 9.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": false - }, + "Default": 0 + } + ], + "Outputs": [ { - "Name": "Dictionary", - "Type": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Term", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "List of terms", - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Sort", - "Type": { - "Kind": "Enum", - "Values": [ - "Occurrence", - "Value" - ] - }, - "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", - "Required": false, - "SortOrder": 5.0, - "IsNullable": false, - "Default": "Occurrence" - }, - { - "Name": "DropUnknowns", - "Type": "Bool", - "Desc": "Drop unknown terms instead of mapping them to NA term.", - "Aliases": [ - "dropna" - ], - "Required": false, - "SortOrder": 6.0, - "IsNullable": false, - "Default": false - } - ] - }, - "Desc": "A dictionary of whitelisted terms.", - "Aliases": [ - "dict" - ], - "Required": false, - "SortOrder": 10.0, - "IsNullable": false, - "Default": null + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" }, { - "Name": "WordFeatureExtractor", - "Type": { - "Kind": "Component", - "ComponentKind": "NgramExtractor" - }, - "Desc": "Ngram feature extractor to use for words (WordBag/WordHashBag).", - "Aliases": [ - "wordExtractor" - ], - "Required": false, - "SortOrder": 11.0, - 
"IsNullable": false, - "Default": { - "Name": "NGram", - "Settings": { - "MaxNumTerms": [ - 10000000 - ] - } - } - }, + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.PredictedLabelColumnOriginalValueConverter", + "Desc": "Transforms a predicted label column to its original values, unless it is of type bool.", + "FriendlyName": "Convert Predicted Label", + "ShortName": null, + "Inputs": [ { - "Name": "CharFeatureExtractor", - "Type": { - "Kind": "Component", - "ComponentKind": "NgramExtractor" - }, - "Desc": "Ngram feature extractor to use for characters (WordBag/WordHashBag).", - "Aliases": [ - "charExtractor" - ], - "Required": false, - "SortOrder": 12.0, - "IsNullable": false, - "Default": { - "Name": "NGram", - "Settings": { - "NgramLength": 3, - "AllLengths": false, - "MaxNumTerms": [ - 10000000 - ] - } - } + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "VectorNormalizer", - "Type": { - "Kind": "Enum", - "Values": [ - "None", - "L1", - "L2", - "LInf" - ] - }, - "Desc": "Normalize vectors (rows) individually by rescaling them to unit norm.", - "Aliases": [ - "norm" - ], - "Required": false, - "SortOrder": 13.0, - "IsNullable": false, - "Default": "L2" + "Name": "PredictedLabelColumn", + "Type": "String", + "Desc": "The predicted label column", + "Required": true, + "SortOrder": 2.0, + "IsNullable": false } ], "Outputs": [ @@ -17854,10 +17441,10 @@ ] }, { - "Name": "Transforms.TextToKeyConverter", - "Desc": "Converts input values (words, numbers, etc.) 
to index in a dictionary.", - "FriendlyName": "Term Transform", - "ShortName": null, + "Name": "Transforms.RandomNumberGenerator", + "Desc": "Adds a column with a generated number sequence.", + "FriendlyName": "Generate Number Transform", + "ShortName": "Generate", "Inputs": [ { "Name": "Column", @@ -17867,91 +17454,48 @@ "Kind": "Struct", "Fields": [ { - "Name": "MaxNumTerms", - "Type": "Int", - "Desc": "Maximum number of terms to keep when auto-training", + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", "Aliases": [ - "max" + "name" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, + "IsNullable": false, "Default": null }, { - "Name": "Term", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "List of terms", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Sort", - "Type": { - "Kind": "Enum", - "Values": [ - "Occurrence", - "Value" - ] - }, - "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Name": "UseCounter", + "Type": "Bool", + "Desc": "Use an auto-incremented integer starting at zero instead of a random number", + "Aliases": [ + "cnt" + ], "Required": false, "SortOrder": 150.0, "IsNullable": true, "Default": null }, { - "Name": "TextKeyValues", - "Type": "Bool", - "Desc": "Whether key value metadata should be text, regardless of the actual input type", - "Aliases": [ - "textkv" - ], + "Name": "Seed", + "Type": "UInt", + "Desc": "The random seed", "Required": false, "SortOrder": 150.0, "IsNullable": true, "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null } ] } }, - "Desc": "New column definition(s) (optional form: name:src)", + "Desc": "New column definition(s) (optional form: name:seed)", "Aliases": [ "col" ], - "Required": false, + "Required": true, "SortOrder": 1.0, - "IsNullable": false, - "Default": null + "IsNullable": false }, { "Name": "Data", @@ -17962,55 +17506,115 @@ "IsNullable": false }, { - "Name": "MaxNumTerms", - "Type": "Int", - "Desc": "Maximum number of terms to keep per column when auto-training", + "Name": "UseCounter", + "Type": "Bool", + "Desc": "Use an auto-incremented integer starting at zero instead of a random number", "Aliases": [ - "max" + "cnt" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 1000000 + "Default": false }, { - "Name": "Term", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "List of terms", + "Name": "Seed", + "Type": "UInt", + "Desc": 
"The random seed", "Required": false, - "SortOrder": 106.0, + "SortOrder": 150.0, "IsNullable": false, + "Default": 42 + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.RowRangeFilter", + "Desc": "Filters a dataview on a column of type Single, Double or Key (contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. If the input is a Key type, the min/max are considered percentages of the number of values.", + "FriendlyName": "Range Filter", + "ShortName": "RangeFilter", + "Inputs": [ + { + "Name": "Column", + "Type": "String", + "Desc": "Column", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Min", + "Type": "Float", + "Desc": "Minimum value (0 to 1 for key types)", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, "Default": null }, { - "Name": "Sort", - "Type": { - "Kind": "Enum", - "Values": [ - "Occurrence", - "Value" - ] - }, - "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Name": "Max", + "Type": "Float", + "Desc": "Maximum value (0 to 1 for key types)", "Required": false, - "SortOrder": 113.0, - "IsNullable": false, - "Default": "Occurrence" + "SortOrder": 150.0, + "IsNullable": true, + "Default": null }, { - "Name": "TextKeyValues", + "Name": "Complement", "Type": "Bool", - "Desc": "Whether key value metadata should be text, regardless of the actual input type", - "Aliases": [ - "textkv" - ], + "Desc": "If true, keep the values that fall outside the range.", "Required": false, - "SortOrder": 114.0, + "SortOrder": 150.0, "IsNullable": false, "Default": false + }, + { + "Name": "IncludeMin", + "Type": "Bool", + "Desc": "If true, include in the range the values that are equal to min.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "IncludeMax", + "Type": "Bool", + "Desc": "If true, include in the range the values that are equal to max.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null } ], "Outputs": [ @@ -18033,11 +17637,23 @@ ] }, { - "Name": "Transforms.TrainTestDatasetSplitter", - "Desc": "Split the dataset into train and test sets", - "FriendlyName": "Dataset Train-Test Split", - "ShortName": null, + "Name": "Transforms.RowSkipAndTakeFilter", + "Desc": "Allows limiting input to a subset of rows at an optional offset. 
Can be used to implement data paging.", + "FriendlyName": "Skip and Take Filter", + "ShortName": "SkipTake", "Inputs": [ + { + "Name": "Skip", + "Type": "Int", + "Desc": "Number of items to skip", + "Aliases": [ + "s" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": true, + "Default": null + }, { "Name": "Data", "Type": "DataView", @@ -18047,726 +17663,2364 @@ "IsNullable": false }, { - "Name": "Fraction", - "Type": "Float", - "Desc": "Fraction of training data", - "Required": false, - "SortOrder": 2.0, - "IsNullable": false, - "Default": 0.8 - }, - { - "Name": "StratificationColumn", - "Type": "String", - "Desc": "Stratification column", + "Name": "Take", + "Type": "Int", + "Desc": "Number of items to take", "Aliases": [ - "strat" + "t" ], "Required": false, - "SortOrder": 3.0, - "IsNullable": false, + "SortOrder": 2.0, + "IsNullable": true, "Default": null } ], "Outputs": [ { - "Name": "TrainData", - "Type": "DataView", - "Desc": "Training data" + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.RowSkipFilter", + "Desc": "Allows limiting input to a subset of rows by skipping a number of rows.", + "FriendlyName": "Skip Filter", + "ShortName": "Skip", + "Inputs": [ + { + "Name": "Count", + "Type": "Int", + "Desc": "Number of items to skip", + "Aliases": [ + "c", + "n", + "s" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 0 + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + 
"ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.RowTakeFilter", + "Desc": "Allows limiting input to a subset of rows by taking N first rows.", + "FriendlyName": "Take Filter", + "ShortName": "Take", + "Inputs": [ + { + "Name": "Count", + "Type": "Int", + "Desc": "Number of items to take", + "Aliases": [ + "c", + "n", + "t" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 9223372036854775807 + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.ScoreColumnSelector", + "Desc": "Selects only the last score columns and the extra columns specified in the arguments.", + "FriendlyName": "Choose Columns By Index", + "ShortName": null, + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ExtraColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Extra columns to write", + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": null + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.Scorer", + "Desc": "Turn the predictor model into a transform model", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The 
predictor model to turn into a transform", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "ScoredData", + "Type": "DataView", + "Desc": "The scored dataset" + }, + { + "Name": "ScoringTransform", + "Type": "TransformModel", + "Desc": "The scoring transform" + } + ] + }, + { + "Name": "Transforms.Segregator", + "Desc": "Un-groups vector columns into sequences of rows, inverse of Group transform", + "FriendlyName": "Un-group Transform", + "ShortName": "Ungroup", + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Columns to unroll, or 'pivot'", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 150.0, + "IsNullable": false + }, + { + "Name": "Mode", + "Type": { + "Kind": "Enum", + "Values": [ + "Inner", + "Outer", + "First" + ] + }, + "Desc": "Specifies how to unroll multiple pivot columns of different size.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "Inner" + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.SentimentAnalyzer", + "Desc": "Uses a pretrained sentiment model to score input strings", + "FriendlyName": "Sentiment Analyzing Transform", + "ShortName": "Senti", + "Inputs": [ + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column.", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Name", 
+ "Type": "String", + "Desc": "Name of the new column.", + "Aliases": [ + "dst" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": null + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.SupervisedBinNormalizer", + "Desc": "Similar to BinNormalizer, but calculates bins based on correlation with the label column, not equi-density. The new value is bin_number / number_of_bins.", + "FriendlyName": "Supervised Binning Normalizer", + "ShortName": "SupBin", + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "NumBins", + "Type": "Int", + "Desc": "Max number of bins, power of 2 recommended", + "Aliases": [ + "bins" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "FixZero", + "Type": "Bool", + "Desc": "Whether to map zero to zero, preserving sparsity", + "Aliases": [ + "zero" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "MaxTrainingExamples", + "Type": "Int", + "Desc": "Max number of examples used to train the normalizer", + "Aliases": [ + "maxtrain" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) 
(optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "LabelColumn", + "Type": "String", + "Desc": "Label column for supervised binning", + "Aliases": [ + "label", + "lab" + ], + "Required": true, + "SortOrder": 150.0, + "IsNullable": false + }, + { + "Name": "MinBinSize", + "Type": "Int", + "Desc": "Minimum number of examples per bin", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 10 + }, + { + "Name": "NumBins", + "Type": "Int", + "Desc": "Max number of bins, power of 2 recommended", + "Aliases": [ + "bins" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1024 + }, + { + "Name": "FixZero", + "Type": "Bool", + "Desc": "Whether to map zero to zero, preserving sparsity", + "Aliases": [ + "zero" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "MaxTrainingExamples", + "Type": "Int", + "Desc": "Max number of examples used to train the normalizer", + "Aliases": [ + "maxtrain" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000000000 + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.TextFeaturizer", + "Desc": "A transform that turns a collection of text documents into numerical feature vectors. 
The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text.", + "FriendlyName": "Text Transform", + "ShortName": "Text", + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + }, + "Desc": "New column definition (optional form: name:srcs).", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Language", + "Type": { + "Kind": "Enum", + "Values": [ + "English", + "French", + "German", + "Dutch", + "Italian", + "Spanish", + "Japanese" + ] + }, + "Desc": "Dataset language or 'AutoDetect' to detect language per row.", + "Aliases": [ + "lang" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "English" + }, + { + "Name": "StopWordsRemover", + "Type": { + "Kind": "Component", + "ComponentKind": "StopWordsRemover" + }, + "Desc": "Stopwords remover.", + "Aliases": [ + "remover" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "TextCase", + "Type": { + "Kind": "Enum", + "Values": [ + "Lower", + "Upper", + "None" + ] + }, + "Desc": "Casing text using the rules of the invariant culture.", + "Aliases": [ + "case" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": "Lower" + }, + { + "Name": "KeepDiacritics", + "Type": "Bool", + "Desc": "Whether to keep diacritical 
marks or remove them.", + "Aliases": [ + "diac" + ], + "Required": false, + "SortOrder": 6.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "KeepPunctuations", + "Type": "Bool", + "Desc": "Whether to keep punctuation marks or remove them.", + "Aliases": [ + "punc" + ], + "Required": false, + "SortOrder": 7.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "KeepNumbers", + "Type": "Bool", + "Desc": "Whether to keep numbers or remove them.", + "Aliases": [ + "num" + ], + "Required": false, + "SortOrder": 8.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "OutputTokens", + "Type": "Bool", + "Desc": "Whether to output the transformed text tokens as an additional column.", + "Aliases": [ + "tokens", + "showtext", + "showTransformedText" + ], + "Required": false, + "SortOrder": 9.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "Dictionary", + "Type": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Term", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of terms", + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Sort", + "Type": { + "Kind": "Enum", + "Values": [ + "Occurrence", + "Value" + ] + }, + "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": "Occurrence" + }, + { + "Name": "DropUnknowns", + "Type": "Bool", + "Desc": "Drop unknown terms instead of mapping them to NA term.", + "Aliases": [ + "dropna" + ], + "Required": false, + "SortOrder": 6.0, + "IsNullable": false, + "Default": false + } + ] + }, + "Desc": "A dictionary of whitelisted terms.", + "Aliases": [ + "dict" + ], + "Required": false, + "SortOrder": 10.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "WordFeatureExtractor", + "Type": { + "Kind": "Component", + "ComponentKind": "NgramExtractor" + }, + "Desc": "Ngram feature extractor to use for words (WordBag/WordHashBag).", + "Aliases": [ + "wordExtractor" + ], + "Required": false, + "SortOrder": 11.0, + "IsNullable": false, + "Default": { + "Name": "NGram", + "Settings": { + "MaxNumTerms": [ + 10000000 + ] + } + } + }, + { + "Name": "CharFeatureExtractor", + "Type": { + "Kind": "Component", + "ComponentKind": "NgramExtractor" + }, + "Desc": "Ngram feature extractor to use for characters (WordBag/WordHashBag).", + "Aliases": [ + "charExtractor" + ], + "Required": false, + "SortOrder": 12.0, + "IsNullable": false, + "Default": { + "Name": "NGram", + "Settings": { + "NgramLength": 3, + "AllLengths": false, + "MaxNumTerms": [ + 10000000 + ] + } + } + }, + { + "Name": "VectorNormalizer", + "Type": { + "Kind": "Enum", + "Values": [ + "None", + "L1", + "L2", + "LInf" + ] + }, + "Desc": "Normalize vectors (rows) individually by rescaling them to unit norm.", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 13.0, + "IsNullable": false, + "Default": "L2" + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + 
"InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.TextToKeyConverter", + "Desc": "Converts input values (words, numbers, etc.) to index in a dictionary.", + "FriendlyName": "Term Transform", + "ShortName": null, + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "MaxNumTerms", + "Type": "Int", + "Desc": "Maximum number of terms to keep when auto-training", + "Aliases": [ + "max" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Term", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of terms", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Sort", + "Type": { + "Kind": "Enum", + "Values": [ + "Occurrence", + "Value" + ] + }, + "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "TextKeyValues", + "Type": "Bool", + "Desc": "Whether key value metadata should be text, regardless of the actual input type", + "Aliases": [ + "textkv" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "MaxNumTerms", + "Type": "Int", + "Desc": "Maximum number of terms to keep per column when auto-training", + "Aliases": [ + "max" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": 1000000 + }, + { + "Name": "Term", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of terms", + "Required": false, + "SortOrder": 106.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Sort", + "Type": { + "Kind": "Enum", + "Values": [ + "Occurrence", + "Value" + ] + }, + "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Required": false, + "SortOrder": 113.0, + "IsNullable": false, + "Default": "Occurrence" + }, + { + "Name": "TextKeyValues", + "Type": "Bool", + "Desc": "Whether key value metadata should be text, regardless of the actual input type", + "Aliases": [ + "textkv" + ], + "Required": false, + "SortOrder": 114.0, + "IsNullable": false, + "Default": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.TrainTestDatasetSplitter", + "Desc": "Split the dataset into train and test sets", + "FriendlyName": "Dataset Train-Test Split", + "ShortName": null, + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Fraction", + "Type": "Float", + "Desc": "Fraction of training data", + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": 0.8 + }, + { + "Name": "StratificationColumn", + "Type": "String", + "Desc": "Stratification column", + "Aliases": [ + "strat" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": null + } + ], + "Outputs": [ + { + "Name": "TrainData", + "Type": "DataView", + "Desc": "Training data" + }, + { + "Name": "TestData", + "Type": "DataView", + "Desc": "Testing data" + } + ] + }, + { + "Name": "Transforms.TreeLeafFeaturizer", + "Desc": "Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. 
A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices.", + "FriendlyName": "Tree Ensemble Featurization Transform", + "ShortName": "TreeFeat", + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "Trainer to use", + "Required": true, + "SortOrder": 10.0, + "IsNullable": false + }, + { + "Name": "Suffix", + "Type": "String", + "Desc": "Output column: The suffix to append to the default column names", + "Aliases": [ + "ex" + ], + "Required": false, + "SortOrder": 101.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "LabelPermutationSeed", + "Type": "Int", + "Desc": "If specified, determines the permutation seed for applying this featurizer to a multiclass problem.", + "Aliases": [ + "lps" + ], + "Required": false, + "SortOrder": 102.0, + "IsNullable": false, + "Default": 0 + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "IFeaturizerInput", + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.TwoHeterogeneousModelCombiner", + "Desc": "Combines a TransformModel and a PredictorModel into a single PredictorModel.", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "TransformModel", + "Type": "TransformModel", + "Desc": "Transform model", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, 
+ { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "Predictor model", + "Required": true, + "SortOrder": 2.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "Predictor model" + } + ] + }, + { + "Name": "Transforms.WordTokenizer", + "Desc": "The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.", + "FriendlyName": "Tokenize Text Transform", + "ShortName": "TokenizeTextTransform", + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "TermSeparators", + "Type": "String", + "Desc": "Comma separated set of term separator(s). Commonly: 'space', 'comma', 'semicolon' or other single character.", + "Aliases": [ + "sep" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s)", + "Aliases": [ + "col" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "TermSeparators", + "Type": "String", + "Desc": "Comma separated set of term separator(s). 
Commonly: 'space', 'comma', 'semicolon' or other single character.", + "Aliases": [ + "sep" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "space" + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + } + ], + "Components": [ + { + "Kind": "AutoMlEngine", + "Components": [ + { + "Name": "Defaults", + "Desc": "AutoML engine that returns learners with default settings.", + "FriendlyName": "Defaults Engine", + "Settings": [] + }, + { + "Name": "Rocket", + "Desc": "AutoML engine that consists of distinct, hierarchical stages of operation.", + "FriendlyName": "Rocket Engine", + "Settings": [ + { + "Name": "TopKLearners", + "Type": "Int", + "Desc": "Number of learners to retain for second stage.", + "Aliases": [ + "topk" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 2 + }, + { + "Name": "SecondRoundTrialsPerLearner", + "Type": "Int", + "Desc": "Number of trials for retained second stage learners.", + "Aliases": [ + "stage2num" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": 5 + }, + { + "Name": "RandomInitialization", + "Type": "Bool", + "Desc": "Use random initialization only.", + "Aliases": [ + "randinit" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "NumInitializationPipelines", + "Type": "Int", + "Desc": "Number of initilization pipelines, used for random initialization only.", + "Aliases": [ + "numinitseeds" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": 20 + } + ] + }, + { + "Name": "UniformRandom", + "Desc": "AutoML engine using uniform random sampling.", + "FriendlyName": "Uniform Random Engine", + "Settings": [] + } + ] + }, + { + 
"Kind": "AutoMlStateBase", + "Components": [ + { + "Name": "AutoMlState", + "Desc": "State of an AutoML search and search space.", + "FriendlyName": "AutoML State", + "Aliases": [ + "automlst" + ], + "Settings": [ + { + "Name": "Metric", + "Type": { + "Kind": "Enum", + "Values": [ + "Auc", + "AccuracyMicro", + "AccuracyMacro", + "L2", + "F1", + "AuPrc", + "TopKAccuracy", + "Rms", + "LossFn", + "RSquared", + "LogLoss", + "LogLossReduction", + "Ndcg", + "Dcg", + "PositivePrecision", + "PositiveRecall", + "NegativePrecision", + "NegativeRecall", + "DrAtK", + "DrAtPFpr", + "DrAtNumPos", + "NumAnomalies", + "ThreshAtK", + "ThreshAtP", + "ThreshAtNumPos", + "Nmi", + "AvgMinScore", + "Dbi" + ] + }, + "Desc": "Supported metric for evaluator.", + "Aliases": [ + "metric" + ], + "Required": true, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "Auc" + }, + { + "Name": "Engine", + "Type": { + "Kind": "Component", + "ComponentKind": "AutoMlEngine" + }, + "Desc": "AutoML engine (pipeline optimizer) that generates next candidates.", + "Aliases": [ + "engine" + ], + "Required": true, + "SortOrder": 150.0, + "IsNullable": false + }, + { + "Name": "TrainerKind", + "Type": { + "Kind": "Enum", + "Values": [ + "SignatureBinaryClassifierTrainer", + "SignatureMultiClassClassifierTrainer", + "SignatureRankerTrainer", + "SignatureRegressorTrainer", + "SignatureMultiOutputRegressorTrainer", + "SignatureAnomalyDetectorTrainer", + "SignatureClusteringTrainer" + ] + }, + "Desc": "Kind of trainer for task, such as binary classification trainer, multiclass trainer, etc.", + "Aliases": [ + "tk" + ], + "Required": true, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "SignatureBinaryClassifierTrainer" + }, + { + "Name": "TerminatorArgs", + "Type": { + "Kind": "Component", + "ComponentKind": "SearchTerminator" + }, + "Desc": "Arguments for creating terminator, which determines when to stop search.", + "Aliases": [ + "term" + ], + "Required": true, + "SortOrder": 150.0, + 
"IsNullable": false + }, + { + "Name": "RequestedLearners", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Learner set to sweep over (if available).", + "Aliases": [ + "learners" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + ] + }, + { + "Kind": "CalibratorTrainer", + "Components": [ + { + "Name": "FixedPlattCalibrator", + "Desc": null, + "FriendlyName": "Fixed Platt Calibrator", + "Aliases": [ + "FixedPlatt", + "FixedSigmoid" + ], + "Settings": [ + { + "Name": "Slope", + "Type": "Float", + "Desc": "The slope parameter of f(x) = 1 / (1 + exp(-slope * x + offset)", + "Aliases": [ + "a" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + }, + { + "Name": "Offset", + "Type": "Float", + "Desc": "The offset parameter of f(x) = 1 / (1 + exp(-slope * x + offset)", + "Aliases": [ + "b" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 + } + ] + }, + { + "Name": "NaiveCalibrator", + "Desc": null, + "FriendlyName": "Naive Calibrator", + "Aliases": [ + "Naive" + ], + "Settings": [] + }, + { + "Name": "PavCalibrator", + "Desc": null, + "FriendlyName": "PAV Calibrator", + "Aliases": [ + "Pav" + ], + "Settings": [] + }, + { + "Name": "PlattCalibrator", + "Desc": "Platt calibration.", + "FriendlyName": "Platt Calibrator", + "Aliases": [ + "Platt", + "Sigmoid" + ], + "Settings": [] + } + ] + }, + { + "Kind": "ClassificationLossFunction", + "Components": [ + { + "Name": "ExpLoss", + "Desc": "Exponential loss.", + "FriendlyName": "Exponential Loss", + "Settings": [ + { + "Name": "Beta", + "Type": "Float", + "Desc": "Beta (dilation)", + "Aliases": [ + "beta" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + } + ] + }, + { + "Name": "HingeLoss", + "Desc": "Hinge loss.", + "FriendlyName": "Hinge loss", + "Aliases": [ + "Hinge" + ], + "Settings": [ + { + "Name": "Margin", + "Type": "Float", + 
"Desc": "Margin value", + "Aliases": [ + "marg" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + } + ] + }, + { + "Name": "LogLoss", + "Desc": "Log loss.", + "FriendlyName": "Log loss", + "Aliases": [ + "Logistic", + "CrossEntropy" + ], + "Settings": [] + }, + { + "Name": "SmoothedHingeLoss", + "Desc": "Smoothed Hinge loss.", + "FriendlyName": "Smoothed Hinge Loss", + "Aliases": [ + "SmoothedHinge" + ], + "Settings": [ + { + "Name": "SmoothingConst", + "Type": "Float", + "Desc": "Smoothing constant", + "Aliases": [ + "smooth" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + } + ] + } + ] + }, + { + "Kind": "EarlyStoppingCriterion", + "Components": [ + { + "Name": "GL", + "Desc": "Stop in case of loss of generality.", + "FriendlyName": "Loss of Generality (GL)", + "Settings": [ + { + "Name": "Threshold", + "Type": "Float", + "Desc": "Threshold in range [0,1].", + "Aliases": [ + "th" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01, + "Range": { + "Max": 1.0, + "Min": 0.0 + } + } + ] + }, + { + "Name": "LP", + "Desc": "Stops in case of low progress.", + "FriendlyName": "Low Progress (LP)", + "Settings": [ + { + "Name": "Threshold", + "Type": "Float", + "Desc": "Threshold in range [0,1].", + "Aliases": [ + "th" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01, + "Range": { + "Max": 1.0, + "Min": 0.0 + } + }, + { + "Name": "WindowSize", + "Type": "Int", + "Desc": "The window size.", + "Aliases": [ + "w" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 5, + "Range": { + "Inf": 0 + } + } + ] + }, + { + "Name": "PQ", + "Desc": "Stops in case of generality to progress ration exceeds threshold.", + "FriendlyName": "Generality to Progress Ratio (PQ)", + "Settings": [ + { + "Name": "Threshold", + "Type": "Float", + "Desc": "Threshold in range [0,1].", + "Aliases": [ + "th" + ], + 
"Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01, + "Range": { + "Max": 1.0, + "Min": 0.0 + } + }, + { + "Name": "WindowSize", + "Type": "Int", + "Desc": "The window size.", + "Aliases": [ + "w" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 5, + "Range": { + "Inf": 0 + } + } + ] + }, + { + "Name": "TR", + "Desc": "Stop if validation score exceeds threshold value.", + "FriendlyName": "Tolerant (TR)", + "Settings": [ + { + "Name": "Threshold", + "Type": "Float", + "Desc": "Tolerance threshold. (Non negative value)", + "Aliases": [ + "th" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01, + "Range": { + "Min": 0.0 + } + } + ] }, { - "Name": "TestData", - "Type": "DataView", - "Desc": "Testing data" + "Name": "UP", + "Desc": "Stops in case of consecutive loss in generality.", + "FriendlyName": "Consecutive Loss in Generality (UP)", + "Settings": [ + { + "Name": "WindowSize", + "Type": "Int", + "Desc": "The window size.", + "Aliases": [ + "w" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 5, + "Range": { + "Inf": 0 + } + } + ] } ] }, { - "Name": "Transforms.TreeLeafFeaturizer", - "Desc": "Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. 
This can handle key labels by training a regression model towards their optionally permuted indices.", - "FriendlyName": "Tree Ensemble Featurization Transform", - "ShortName": "TreeFeat", - "Inputs": [ - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "Trainer to use", - "Required": true, - "SortOrder": 10.0, - "IsNullable": false - }, + "Kind": "EnsembleDiversityMeasure", + "Components": [ { - "Name": "Suffix", - "Type": "String", - "Desc": "Output column: The suffix to append to the default column names", - "Aliases": [ - "ex" - ], - "Required": false, - "SortOrder": 101.0, - "IsNullable": false, - "Default": null + "Name": "DisagreementDiversityMeasure", + "Desc": null, + "FriendlyName": "Disagreement Diversity Measure", + "Settings": [] }, { - "Name": "LabelPermutationSeed", - "Type": "Int", - "Desc": "If specified, determines the permutation seed for applying this featurizer to a multiclass problem.", - "Aliases": [ - "lps" - ], - "Required": false, - "SortOrder": 102.0, - "IsNullable": false, - "Default": 0 - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Name": "MultiDisagreementDiversityMeasure", + "Desc": null, + "FriendlyName": "Disagreement Diversity Measure", + "Settings": [] }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" + "Name": "RegressionDisagreementDiversityMeasure", + "Desc": null, + "FriendlyName": "Disagreement Diversity Measure", + "Settings": [] } - ], - "InputKind": [ - "IFeaturizerInput", - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" ] }, { - "Name": "Transforms.TwoHeterogeneousModelCombiner", - "Desc": "Combines a TransformModel and a PredictorModel into a single PredictorModel.", - "FriendlyName": null, - "ShortName": null, - "Inputs": [ + "Kind": "EnsembleFeatureSelector", 
+ "Components": [ { - "Name": "TransformModel", - "Type": "TransformModel", - "Desc": "Transform model", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Name": "AllFeatureSelector", + "Desc": null, + "FriendlyName": "All Feature Selector", + "Settings": [] }, { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "Predictor model", - "Required": true, - "SortOrder": 2.0, - "IsNullable": false - } - ], - "Outputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "Predictor model" + "Name": "RandomFeatureSelector", + "Desc": null, + "FriendlyName": "Random Feature Selector", + "Settings": [ + { + "Name": "FeaturesSelectionProportion", + "Type": "Float", + "Desc": "The proportion of features to be selected. The range is 0.0-1.0", + "Aliases": [ + "fp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.8 + } + ] } ] }, { - "Name": "Transforms.WordTokenizer", - "Desc": "The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.", - "FriendlyName": "Tokenize Text Transform", - "ShortName": "TokenizeTextTransform", - "Inputs": [ + "Kind": "EnsembleOutputCombiner", + "Components": [ { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "TermSeparators", - "Type": "String", - "Desc": "Comma separated set of term separator(s). 
Commonly: 'space', 'comma', 'semicolon' or other single character.", - "Aliases": [ - "sep" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": "New column definition(s)", - "Aliases": [ - "col" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": null + "Name": "Average", + "Desc": null, + "FriendlyName": "Average", + "Settings": [] }, { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Name": "Median", + "Desc": null, + "FriendlyName": "Median", + "Settings": [] }, { - "Name": "TermSeparators", - "Type": "String", - "Desc": "Comma separated set of term separator(s). 
Commonly: 'space', 'comma', 'semicolon' or other single character.", - "Aliases": [ - "sep" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "space" - } - ], - "Outputs": [ + "Name": "MultiAverage", + "Desc": null, + "FriendlyName": "Average", + "Settings": [ + { + "Name": "Normalize", + "Type": "Bool", + "Desc": "Whether to normalize the output of base models before combining them", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": true + } + ] + }, { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Name": "MultiMedian", + "Desc": null, + "FriendlyName": "Median", + "Settings": [ + { + "Name": "Normalize", + "Type": "Bool", + "Desc": "Whether to normalize the output of base models before combining them", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": true + } + ] }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - } - ], - "Components": [ - { - "Kind": "AutoMlEngine", - "Components": [ - { - "Name": "Defaults", - "Desc": "AutoML engine that returns learners with default settings.", - "FriendlyName": "Defaults Engine", - "Settings": [] + "Name": "MultiStacking", + "Desc": null, + "FriendlyName": "Stacking", + "Settings": [ + { + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", + "Aliases": [ + "vp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.3 + } + ] }, { - "Name": "Rocket", - "Desc": "AutoML engine that consists of distinct, hierarchical stages of operation.", - "FriendlyName": "Rocket Engine", + "Name": "MultiVoting", + "Desc": null, + "FriendlyName": "Voting", "Settings": [ { - "Name": "TopKLearners", - "Type": "Int", - "Desc": "Number of learners to retain for second stage.", + "Name": "Normalize", + "Type": "Bool", + "Desc": "Whether to normalize the output of base models before combining them", "Aliases": [ - "topk" + "norm" ], "Required": false, - "SortOrder": 1.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 2 - }, + "Default": true + } + ] + }, + { + "Name": "MultiWeightedAverage", + "Desc": null, + "FriendlyName": "Multi Weighted Average", + "Settings": [ { - "Name": "SecondRoundTrialsPerLearner", - "Type": "Int", - "Desc": "Number of trials for retained second stage learners.", + "Name": "WeightageName", + "Type": { + "Kind": "Enum", + "Values": [ + "AccuracyMicroAvg", + "AccuracyMacroAvg" + ] + }, + "Desc": "The metric type to be used to find the weights for each model", "Aliases": [ - "stage2num" + "wn" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 5 + "Default": "AccuracyMicroAvg" }, { - "Name": "RandomInitialization", + "Name": "Normalize", "Type": "Bool", - "Desc": "Use random initialization only.", + "Desc": "Whether to normalize the output of base models before combining them", "Aliases": [ - "randinit" + "norm" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": false - }, + "Default": true + } + ] + }, + { + "Name": "RegressionStacking", + "Desc": null, + "FriendlyName": "Stacking", + "Settings": [ { - "Name": "NumInitializationPipelines", - "Type": "Int", - "Desc": "Number of initilization pipelines, used for random initialization 
only.", + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set", "Aliases": [ - "numinitseeds" + "vp" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 20 + "Default": 0.3 } ] }, { - "Name": "UniformRandom", - "Desc": "AutoML engine using uniform random sampling.", - "FriendlyName": "Uniform Random Engine", + "Name": "Stacking", + "Desc": null, + "FriendlyName": "Stacking", + "Settings": [ + { + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set", + "Aliases": [ + "vp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.3 + } + ] + }, + { + "Name": "Voting", + "Desc": null, + "FriendlyName": "Voting", "Settings": [] - } - ] - }, - { - "Kind": "AutoMlStateBase", - "Components": [ + }, { - "Name": "AutoMlState", - "Desc": "State of an AutoML search and search space.", - "FriendlyName": "AutoML State", - "Aliases": [ - "automlst" - ], + "Name": "WeightedAverage", + "Desc": null, + "FriendlyName": "Stacking", "Settings": [ { - "Name": "Metric", + "Name": "WeightageName", "Type": { "Kind": "Enum", "Values": [ + "Accuracy", "Auc", - "AccuracyMicro", - "AccuracyMacro", - "L2", - "F1", - "AuPrc", - "TopKAccuracy", - "Rms", - "LossFn", - "RSquared", - "LogLoss", - "LogLossReduction", - "Ndcg", - "Dcg", - "PositivePrecision", - "PositiveRecall", - "NegativePrecision", - "NegativeRecall", - "DrAtK", - "DrAtPFpr", - "DrAtNumPos", - "NumAnomalies", - "ThreshAtK", - "ThreshAtP", - "ThreshAtNumPos", - "Nmi", - "AvgMinScore", - "Dbi" + "PosPrecision", + "PosRecall", + "NegPrecision", + "NegRecall" ] }, - "Desc": "Supported metric for evaluator.", + "Desc": "The metric type to be used to find the weights for each model", "Aliases": [ - "metric" + 
"wn" ], - "Required": true, - "SortOrder": 150.0, + "Required": false, + "SortOrder": 50.0, "IsNullable": false, "Default": "Auc" - }, + } + ] + } + ] + }, + { + "Kind": "EnsembleSubModelSelector", + "Components": [ + { + "Name": "AllSelector", + "Desc": null, + "FriendlyName": "All Selector", + "Settings": [] + }, + { + "Name": "AllSelectorMultiClass", + "Desc": null, + "FriendlyName": "All Selector", + "Settings": [] + }, + { + "Name": "BestDiverseSelector", + "Desc": null, + "FriendlyName": "Best Diverse Selector", + "Settings": [ { - "Name": "Engine", + "Name": "DiversityMetricType", "Type": { "Kind": "Component", - "ComponentKind": "AutoMlEngine" + "ComponentKind": "EnsembleDiversityMeasure" }, - "Desc": "AutoML engine (pipeline optimizer) that generates next candidates.", + "Desc": "The metric type to be used to find the diversity among base learners", "Aliases": [ - "engine" + "dm" ], - "Required": true, - "SortOrder": 150.0, - "IsNullable": false + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": null }, { - "Name": "TrainerKind", - "Type": { - "Kind": "Enum", - "Values": [ - "SignatureBinaryClassifierTrainer", - "SignatureMultiClassClassifierTrainer", - "SignatureRankerTrainer", - "SignatureRegressorTrainer", - "SignatureMultiOutputRegressorTrainer", - "SignatureAnomalyDetectorTrainer", - "SignatureClusteringTrainer" - ] - }, - "Desc": "Kind of trainer for task, such as binary classification trainer, multiclass trainer, etc.", + "Name": "LearnersSelectionProportion", + "Type": "Float", + "Desc": "The proportion of best base learners to be selected. 
The range is 0.0-1.0", "Aliases": [ - "tk" + "lp" ], - "Required": true, - "SortOrder": 150.0, + "Required": false, + "SortOrder": 50.0, "IsNullable": false, - "Default": "SignatureBinaryClassifierTrainer" + "Default": 0.5 }, { - "Name": "TerminatorArgs", + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set", + "Aliases": [ + "vp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.3 + } + ] + }, + { + "Name": "BestDiverseSelectorMultiClass", + "Desc": null, + "FriendlyName": "Best Diverse Selector", + "Settings": [ + { + "Name": "DiversityMetricType", "Type": { "Kind": "Component", - "ComponentKind": "SearchTerminator" + "ComponentKind": "EnsembleDiversityMeasure" }, - "Desc": "Arguments for creating terminator, which determines when to stop search.", + "Desc": "The metric type to be used to find the diversity among base learners", "Aliases": [ - "term" + "dm" ], - "Required": true, - "SortOrder": 150.0, - "IsNullable": false + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": null }, { - "Name": "RequestedLearners", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Learner set to sweep over (if available).", + "Name": "LearnersSelectionProportion", + "Type": "Float", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", "Aliases": [ - "learners" + "lp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": null + "Default": 0.5 + }, + { + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", + "Aliases": [ + "vp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.3 } ] - } - ] - }, - { - "Kind": "CalibratorTrainer", - "Components": [ + }, { - "Name": "FixedPlattCalibrator", + "Name": "BestDiverseSelectorRegression", "Desc": null, - "FriendlyName": "Fixed Platt Calibrator", - "Aliases": [ - "FixedPlatt", - "FixedSigmoid" - ], + "FriendlyName": "Best Diverse Selector", "Settings": [ { - "Name": "Slope", + "Name": "DiversityMetricType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleDiversityMeasure" + }, + "Desc": "The metric type to be used to find the diversity among base learners", + "Aliases": [ + "dm" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "LearnersSelectionProportion", "Type": "Float", - "Desc": "The slope parameter of f(x) = 1 / (1 + exp(-slope * x + offset)", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", "Aliases": [ - "a" + "lp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 1.0 + "Default": 0.5 }, { - "Name": "Offset", + "Name": "ValidationDatasetProportion", "Type": "Float", - "Desc": "The offset parameter of f(x) = 1 / (1 + exp(-slope * x + offset)", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", "Aliases": [ - "b" + "vp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 0.0 + "Default": 0.3 } ] }, { - "Name": "NaiveCalibrator", - "Desc": null, - "FriendlyName": "Naive Calibrator", - "Aliases": [ - "Naive" - ], - "Settings": [] - }, - { - "Name": "PavCalibrator", + "Name": "BestPerformanceRegressionSelector", "Desc": null, - "FriendlyName": "PAV Calibrator", - "Aliases": [ - "Pav" - ], - "Settings": [] - }, - { - "Name": "PlattCalibrator", - "Desc": "Platt calibration.", - "FriendlyName": "Platt Calibrator", - "Aliases": [ - "Platt", - "Sigmoid" - ], - "Settings": [] - } - ] - }, - { - "Kind": "ClassificationLossFunction", - "Components": [ - { - "Name": "ExpLoss", - "Desc": "Exponential loss.", - "FriendlyName": "Exponential Loss", + "FriendlyName": "Best Performance Selector", "Settings": [ { - "Name": "Beta", + "Name": "MetricName", + "Type": { + "Kind": "Enum", + "Values": [ + "L1", + "L2", + "Rms", + "Loss", + "RSquared" + ] + }, + "Desc": "The metric type to be used to find the best performance", + "Aliases": [ + "mn" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": "L1" + }, + { + "Name": "LearnersSelectionProportion", "Type": "Float", - "Desc": "Beta (dilation)", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", "Aliases": [ - "beta" + "lp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 1.0 - } - ] - }, - { - "Name": "HingeLoss", - "Desc": "Hinge loss.", - "FriendlyName": "Hinge loss", - "Aliases": [ - "Hinge" - ], - "Settings": [ + "Default": 0.5 + }, { - "Name": "Margin", + "Name": "ValidationDatasetProportion", "Type": "Float", - "Desc": "Margin value", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", "Aliases": [ - "marg" + "vp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 1.0 + "Default": 0.3 } ] }, { - "Name": "LogLoss", - "Desc": "Log loss.", - "FriendlyName": "Log loss", - "Aliases": [ - "Logistic", - "CrossEntropy" - ], - "Settings": [] - }, - { - "Name": "SmoothedHingeLoss", - "Desc": "Smoothed Hinge loss.", - "FriendlyName": "Smoothed Hinge Loss", - "Aliases": [ - "SmoothedHinge" - ], + "Name": "BestPerformanceSelector", + "Desc": null, + "FriendlyName": "Best Performance Selector", "Settings": [ { - "Name": "SmoothingConst", + "Name": "MetricName", + "Type": { + "Kind": "Enum", + "Values": [ + "Accuracy", + "PosPrecName", + "PosRecallName", + "NegPrecName", + "NegRecallName", + "Auc", + "LogLoss", + "LogLossReduction", + "F1", + "AuPrc" + ] + }, + "Desc": "The metric type to be used to find the best performance", + "Aliases": [ + "mn" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": "Auc" + }, + { + "Name": "LearnersSelectionProportion", "Type": "Float", - "Desc": "Smoothing constant", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", "Aliases": [ - "smooth" + "lp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 1.0 - } - ] - } - ] - }, - { - "Kind": "EarlyStoppingCriterion", - "Components": [ - { - "Name": "GL", - "Desc": "Stop in case of loss of generality.", - "FriendlyName": "Loss of Generality (GL)", - "Settings": [ + "Default": 0.5 + }, { - "Name": "Threshold", + "Name": "ValidationDatasetProportion", "Type": "Float", - "Desc": "Threshold in range [0,1].", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", "Aliases": [ - "th" + "vp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 0.01, - "Range": { - "Max": 1.0, - "Min": 0.0 - } + "Default": 0.3 } ] }, { - "Name": "LP", - "Desc": "Stops in case of low progress.", - "FriendlyName": "Low Progress (LP)", + "Name": "BestPerformanceSelectorMultiClass", + "Desc": null, + "FriendlyName": "Best Performance Selector", "Settings": [ { - "Name": "Threshold", - "Type": "Float", - "Desc": "Threshold in range [0,1].", + "Name": "MetricName", + "Type": { + "Kind": "Enum", + "Values": [ + "AccuracyMicro", + "AccuracyMacro", + "LogLoss", + "LogLossReduction" + ] + }, + "Desc": "The metric type to be used to find the best performance", "Aliases": [ - "th" + "mn" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 0.01, - "Range": { - "Max": 1.0, - "Min": 0.0 - } + "Default": "AccuracyMicro" }, { - "Name": "WindowSize", - "Type": "Int", - "Desc": "The window size.", + "Name": "LearnersSelectionProportion", + "Type": "Float", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", "Aliases": [ - "w" + "lp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 5, - "Range": { - "Inf": 0 - } - } - ] - }, - { - "Name": "PQ", - "Desc": "Stops in case of generality to progress ration exceeds threshold.", - "FriendlyName": "Generality to Progress Ratio (PQ)", - "Settings": [ + "Default": 0.5 + }, { - "Name": "Threshold", + "Name": "ValidationDatasetProportion", "Type": "Float", - "Desc": "Threshold in range [0,1].", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", "Aliases": [ - "th" + "vp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 0.01, - "Range": { - "Max": 1.0, - "Min": 0.0 - } - }, + "Default": 0.3 + } + ] + } + ] + }, + { + "Kind": "EnsembleSubsetSelector", + "Components": [ + { + "Name": "AllInstanceSelector", + "Desc": null, + "FriendlyName": "All Instance Selector", + "Settings": [ { - "Name": "WindowSize", - "Type": "Int", - "Desc": "The window size.", + "Name": "FeatureSelector", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleFeatureSelector" + }, + "Desc": "The Feature selector", "Aliases": [ - "w" + "fs" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 1.0, "IsNullable": false, - "Default": 5, - "Range": { - "Inf": 0 + "Default": { + "Name": "AllFeatureSelector" } } ] }, { - "Name": "TR", - "Desc": "Stop if validation score exceeds threshold value.", - "FriendlyName": "Tolerant (TR)", + "Name": "BootstrapSelector", + "Desc": null, + "FriendlyName": "Bootstrap Selector", "Settings": [ { - "Name": "Threshold", - "Type": "Float", - "Desc": "Tolerance threshold. 
(Non negative value)", + "Name": "FeatureSelector", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleFeatureSelector" + }, + "Desc": "The Feature selector", "Aliases": [ - "th" + "fs" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 1.0, "IsNullable": false, - "Default": 0.01, - "Range": { - "Min": 0.0 + "Default": { + "Name": "AllFeatureSelector" } } ] }, { - "Name": "UP", - "Desc": "Stops in case of consecutive loss in generality.", - "FriendlyName": "Consecutive Loss in Generality (UP)", + "Name": "RandomPartitionSelector", + "Desc": null, + "FriendlyName": "Random Partition Selector", "Settings": [ { - "Name": "WindowSize", - "Type": "Int", - "Desc": "The window size.", + "Name": "FeatureSelector", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleFeatureSelector" + }, + "Desc": "The Feature selector", "Aliases": [ - "w" + "fs" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 1.0, "IsNullable": false, - "Default": 5, - "Range": { - "Inf": 0 + "Default": { + "Name": "AllFeatureSelector" } } ] diff --git a/test/Microsoft.ML.Tests/CSharpCodeGen.cs b/test/Microsoft.ML.Tests/CSharpCodeGen.cs index 5ee92d1c5e..678edac461 100644 --- a/test/Microsoft.ML.Tests/CSharpCodeGen.cs +++ b/test/Microsoft.ML.Tests/CSharpCodeGen.cs @@ -16,7 +16,7 @@ public CSharpCodeGen(ITestOutputHelper output) : base(output) { } - [Fact] + [Fact(Skip = "Execute this test if you want to regenerate CSharpApi file")] public void RegenerateCSharpApi() { var basePath = GetDataPath("../../src/Microsoft.ML/CSharpApi.cs"); From b934541e8442084315bf2e94c977dacc5ec055d7 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Tue, 19 Jun 2018 15:36:07 -0700 Subject: [PATCH 05/20] fix double stacking and double diverseselector in UI clean review comments --- src/Microsoft.ML.Ensemble/EnsembleUtils.cs | 2 +- .../EntryPoints/CreateEnsemble.cs | 4 ++-- .../EntryPoints/OutputCombiner.cs | 16 ++++++++----- .../EntryPoints/SubModelSelector.cs | 15 +++++++++--- 
.../OutputCombiners/BaseMultiAverager.cs | 2 +- .../OutputCombiners/BaseMultiCombiner.cs | 2 +- .../OutputCombiners/BaseStacking.cs | 4 ++-- .../OutputCombiners/IOutputCombiner.cs | 21 ++++++++++++++-- .../OutputCombiners/Median.cs | 2 +- .../OutputCombiners/MultiAverage.cs | 4 ++-- .../OutputCombiners/MultiMedian.cs | 4 ++-- .../OutputCombiners/MultiStacking.cs | 11 +++++---- .../OutputCombiners/MultiVoting.cs | 7 +++--- .../OutputCombiners/MultiWeightedAverage.cs | 8 +++---- .../OutputCombiners/RegressionStacking.cs | 4 ++-- .../OutputCombiners/Stacking.cs | 4 ++-- .../OutputCombiners/WeightedAverage.cs | 4 ++-- src/Microsoft.ML.Ensemble/PipelineEnsemble.cs | 12 +++++----- .../Selector/ISubModelSelector.cs | 15 ++++++++++-- .../SubModelSelector/AllSelectorMultiClass.cs | 2 +- .../SubModelSelector/BaseSubModelSelector.cs | 10 ++++---- .../BestDiverseSelectorBinary.cs | 6 ++--- .../BestDiverseSelectorMultiClass.cs | 8 +++---- .../BestDiverseSelectorRegression.cs | 4 ++-- .../BestPerformanceRegressionSelector.cs | 5 ++-- .../BestPerformanceSelector.cs | 4 ++-- .../BestPerformanceSelectorMultiClass.cs | 6 ++--- .../SubsetSelector/BaseSubsetSelector.cs | 2 +- .../SubsetSelector/BootstrapSelector.cs | 4 ++-- .../SubsetSelector/RandomPartitionSelector.cs | 2 +- .../Trainer/Binary/EnsembleTrainer.cs | 17 ++++++++++++- .../Trainer/EnsembleDistributionPredictor.cs | 2 +- .../Trainer/EnsemblePredictorBase.cs | 2 +- .../Trainer/EnsembleTrainerBase.cs | 24 ++++++------------- .../Multiclass/EnsembleMultiClassPredictor.cs | 2 +- .../MulticlassDataPartitionEnsembleTrainer.cs | 20 +++++++++++++--- .../Regression/RegressionEnsembleTrainer.cs | 16 ++++++++++++- 37 files changed, 177 insertions(+), 100 deletions(-) diff --git a/src/Microsoft.ML.Ensemble/EnsembleUtils.cs b/src/Microsoft.ML.Ensemble/EnsembleUtils.cs index 088a7a9c96..d5134e48c9 100644 --- a/src/Microsoft.ML.Ensemble/EnsembleUtils.cs +++ b/src/Microsoft.ML.Ensemble/EnsembleUtils.cs @@ -27,7 +27,7 @@ public static 
RoleMappedData SelectFeatures(IHost host, RoleMappedData data, Bit if (card == type.VectorSize) return data; - // REVIEW shonk: This doesn't preserve metadata on the features column. Should it? + // REVIEW: This doesn't preserve metadata on the features column. Should it? var name = data.Schema.Feature.Name; var view = LambdaColumnMapper.Create( host, "FeatureSelector", data.Data, name, name, type, type, diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs b/src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs index 96750a5406..8078cac1f1 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs +++ b/src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs @@ -296,7 +296,7 @@ public static CommonOutputs.AnomalyDetectionOutput CreateAnomalyPipelineEnsemble host.CheckValue(input, nameof(input)); host.CheckNonEmpty(input.Models, nameof(input.Models)); - IOutputCombiner combiner; + IRegressionOutputCombiner combiner; switch (input.ModelCombiner) { case ScoreCombiner.Median: @@ -385,7 +385,7 @@ public static void CheckSamePipeline(IHostEnvironment env, IChannel ch, using (var ms = new MemoryStream()) { - // REVIEW yaeld (tfinley): This can be done more efficiently by adding a custom type of repository that + // REVIEW: This can be done more efficiently by adding a custom type of repository that // doesn't actually save the data, but upon stream closure compares the results to the given repository // and then discards it. Currently, however, this cannot be done because ModelSaveContext does not use // an abstract class/interface, but rather the RepositoryWriter class. 
diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs b/src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs index 0a3c0e7b0c..4f4107639a 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs +++ b/src/Microsoft.ML.Ensemble/EntryPoints/OutputCombiner.cs @@ -23,20 +23,24 @@ namespace Microsoft.ML.Ensemble.EntryPoints { [TlcModule.Component(Name = Average.LoadName, FriendlyName = Average.UserName)] - public sealed class AverageFactory : ISupportOutputCombinerFactory + public sealed class AverageFactory : ISupportBinaryOutputCombinerFactory, ISupportRegressionOutputCombinerFactory { - IOutputCombiner IComponentFactory>.CreateComponent(IHostEnvironment env) => new Average(env); + public IRegressionOutputCombiner CreateComponent(IHostEnvironment env) => new Average(env); + + IBinaryOutputCombiner IComponentFactory.CreateComponent(IHostEnvironment env) => new Average(env); } [TlcModule.Component(Name = Median.LoadName, FriendlyName = Median.UserName)] - public sealed class MedianFactory : ISupportOutputCombinerFactory + public sealed class MedianFactory : ISupportBinaryOutputCombinerFactory, ISupportRegressionOutputCombinerFactory { - IOutputCombiner IComponentFactory>.CreateComponent(IHostEnvironment env) => new Median(env); + public IRegressionOutputCombiner CreateComponent(IHostEnvironment env) => new Median(env); + + IBinaryOutputCombiner IComponentFactory.CreateComponent(IHostEnvironment env) => new Median(env); } [TlcModule.Component(Name = Voting.LoadName, FriendlyName = Voting.UserName)] - public sealed class VotingFactory : ISupportOutputCombinerFactory + public sealed class VotingFactory : ISupportBinaryOutputCombinerFactory { - IOutputCombiner IComponentFactory>.CreateComponent(IHostEnvironment env) => new Voting(env); + IBinaryOutputCombiner IComponentFactory.CreateComponent(IHostEnvironment env) => new Voting(env); } } diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs 
b/src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs index 2f3a42bc9c..a5c10e86f2 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs +++ b/src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs @@ -22,14 +22,23 @@ namespace Microsoft.ML.Ensemble.EntryPoints { [TlcModule.Component(Name = AllSelector.LoadName, FriendlyName = AllSelector.UserName)] - public sealed class AllSelectorFactory : ISupportSubModelSelectorFactory + public sealed class AllSelectorFactory : ISupportBinarySubModelSelectorFactory, ISupportRegressionSubModelSelectorFactory { public ISubModelSelector CreateComponent(IHostEnvironment env) => new AllSelector(env); + + IBinarySubModelSelector IComponentFactory.CreateComponent(IHostEnvironment env) => new AllSelector(env); + + IRegressionSubModelSelector IComponentFactory.CreateComponent(IHostEnvironment env) => new AllSelector(env); } [TlcModule.Component(Name = AllSelectorMultiClass.LoadName, FriendlyName = AllSelectorMultiClass.UserName)] - public sealed class AllSelectorMultiClassFactory : ISupportSubModelSelectorFactory> + public sealed class AllSelectorMultiClassFactory : ISupportMulticlassSubModelSelectorFactory { - public ISubModelSelector> CreateComponent(IHostEnvironment env) => new AllSelectorMultiClass(env); + public ISubModelSelector> CreateComponent(IHostEnvironment env) + { + throw new NotImplementedException(); + } + + IMulticlassSubModelSelector IComponentFactory.CreateComponent(IHostEnvironment env) => new AllSelectorMultiClass(env); } } diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs index 25377cb30f..64ec41d613 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiAverager.cs @@ -27,7 +27,7 @@ protected void CombineCore(ref VBuffer dst, VBuffer[] src, Singl Host.AssertNonEmpty(src); Host.Assert(weights == null || Utils.Size(weights) 
== Utils.Size(src)); - // REVIEW shonk: Should this be tolerant of NaNs? + // REVIEW: Should this be tolerant of NaNs? int len = GetClassCount(src); if (!TryNormalize(src)) { diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs index 8b21458280..1258313df1 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseMultiCombiner.cs @@ -11,7 +11,7 @@ namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners { - public abstract class BaseMultiCombiner : IOutputCombiner> + public abstract class BaseMultiCombiner : IMultiClassOutputCombiner { protected readonly IHost Host; diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseStacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseStacking.cs index 96907347ee..7435958e62 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/BaseStacking.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/BaseStacking.cs @@ -145,11 +145,11 @@ public void Train(List>> models, maps[i] = m.GetMapper, TOutput>(); } - // REVIEW shonk: Should implement this better.... + // REVIEW: Should implement this better.... var labels = new Single[100]; var features = new VBuffer[100]; int count = 0; - // REVIEW shonk: Should this include bad values or filter them? + // REVIEW: Should this include bad values or filter them? 
using (var cursor = new FloatLabelCursor(data, CursOpt.AllFeatures | CursOpt.AllLabels)) { TOutput[] predictions = new TOutput[maps.Length]; diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/IOutputCombiner.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/IOutputCombiner.cs index 836b560e8f..d6a83070ba 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/IOutputCombiner.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/IOutputCombiner.cs @@ -42,11 +42,28 @@ public interface IBinaryOutputCombiner : IOutputCombiner { } - [TlcModule.ComponentKind("EnsembleOutputCombiner")] - public interface ISupportOutputCombinerFactory : IComponentFactory> + public interface IMultiClassOutputCombiner : IOutputCombiner> { } + + [TlcModule.ComponentKind("EnsembleMulticlassOutputCombiner")] + public interface ISupportMulticlassOutputCombinerFactory : IComponentFactory + { + } + + [TlcModule.ComponentKind("EnsembleBinaryOutputCombiner")] + public interface ISupportBinaryOutputCombinerFactory : IComponentFactory + { + + } + + [TlcModule.ComponentKind("EnsembleRegressionOutputCombiner")] + public interface ISupportRegressionOutputCombinerFactory : IComponentFactory + { + + } + public interface IWeightedAverager { string WeightageMetricName { get; } diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/Median.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/Median.cs index e29df82e52..de8d950de4 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/Median.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/Median.cs @@ -76,7 +76,7 @@ public Combiner GetCombiner() private void CombineCore(ref Single dst, Single[] src, Single[] weights) { - // REVIEW shonk: This mutates "src". We need to ensure that the documentation of + // REVIEW: This mutates "src". We need to ensure that the documentation of // combiners makes it clear that combiners are allowed to do this. Note that "normalization" // in the multi-class case also mutates. 
_host.AssertNonEmpty(src); diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs index f657ab2342..c147f932f3 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiAverage.cs @@ -32,9 +32,9 @@ private static VersionInfo GetVersionInfo() } [TlcModule.Component(Name = LoadName, FriendlyName = Average.UserName)] - public sealed class Arguments : ArgumentsBase, ISupportOutputCombinerFactory> + public sealed class Arguments : ArgumentsBase, ISupportMulticlassOutputCombinerFactory { - public IOutputCombiner> CreateComponent(IHostEnvironment env) => new MultiAverage(env, this); + public IMultiClassOutputCombiner CreateComponent(IHostEnvironment env) => new MultiAverage(env, this); } public MultiAverage(IHostEnvironment env, Arguments args) diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs index 5a9452ab1e..c3e6869d69 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiMedian.cs @@ -35,9 +35,9 @@ private static VersionInfo GetVersionInfo() } [TlcModule.Component(Name = LoadName, FriendlyName = Median.UserName)] - public sealed class Arguments : ArgumentsBase, ISupportOutputCombinerFactory> + public sealed class Arguments : ArgumentsBase, ISupportMulticlassOutputCombinerFactory { - public IOutputCombiner> CreateComponent(IHostEnvironment env) => new MultiMedian(env, this); + public IMultiClassOutputCombiner CreateComponent(IHostEnvironment env) => new MultiMedian(env, this); } public MultiMedian(IHostEnvironment env, Arguments args) diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs index 623fb1694f..1207af2c5d 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs +++ 
b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs @@ -20,7 +20,7 @@ namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners { using TVectorPredictor = IPredictorProducing>; - public sealed class MultiStacking : BaseStacking, SignatureMultiClassClassifierTrainer>, ICanSaveModel, IOutputCombiner> + public sealed class MultiStacking : BaseStacking, SignatureMultiClassClassifierTrainer>, ICanSaveModel, IMultiClassOutputCombiner { public const string LoadName = "MultiStacking"; public const string LoaderSignature = "MultiStackingCombiner"; @@ -36,12 +36,13 @@ private static VersionInfo GetVersionInfo() } [TlcModule.Component(Name = LoadName, FriendlyName = Stacking.UserName)] - public sealed class Arguments : ArgumentsBase, ISupportOutputCombinerFactory> + public sealed class Arguments : ArgumentsBase, ISupportMulticlassOutputCombinerFactory { - public IOutputCombiner> CreateComponent(IHostEnvironment env) => new MultiStacking(env, this); + public IMultiClassOutputCombiner CreateComponent(IHostEnvironment env) => new MultiStacking(env, this); + public Arguments() { - // REVIEW tfinley: Kinda stupid. Perhaps we can have a better non-parametetric learner. + // REVIEW: Kinda stupid. Perhaps we can have a better non-parametetric learner. 
BasePredictorType = new SubComponent, SignatureMultiClassClassifierTrainer>( "OVA", "p=FastTreeBinaryClassification"); } @@ -75,7 +76,7 @@ protected override void FillFeatureBuffer(VBuffer[] src, ref VBuffer> + public sealed class Arguments : ArgumentsBase, ISupportMulticlassOutputCombinerFactory { public new bool Normalize = false; - public IOutputCombiner> CreateComponent(IHostEnvironment env) => new MultiVoting(env, this); + + public IMultiClassOutputCombiner CreateComponent(IHostEnvironment env) => new MultiVoting(env, this); } public MultiVoting(IHostEnvironment env, Arguments args) diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiWeightedAverage.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiWeightedAverage.cs index a24099577a..66bf23cfa2 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiWeightedAverage.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiWeightedAverage.cs @@ -35,7 +35,7 @@ public sealed class MultiWeightedAverage : BaseMultiAverager, IWeightedAverager, { public const string UserName = "Multi Weighted Average"; public const string LoadName = "MultiWeightedAverage"; - public const string LoaderSignature = "MultiWeightedAverageCombiner"; + public const string LoaderSignature = "MultiWeightedAverageComb"; private static VersionInfo GetVersionInfo() { @@ -48,13 +48,13 @@ private static VersionInfo GetVersionInfo() } [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] - public sealed class Arguments : ArgumentsBase, ISupportOutputCombinerFactory> + public sealed class Arguments : ArgumentsBase, ISupportMulticlassOutputCombinerFactory { + IMultiClassOutputCombiner IComponentFactory.CreateComponent(IHostEnvironment env) => new MultiWeightedAverage(env, this); + [Argument(ArgumentType.AtMostOnce, HelpText = "The metric type to be used to find the weights for each model", ShortName = "wn", SortOrder = 50)] [TGUI(Label = "Metric Name", Description = "The weights are calculated according to the selected 
metric")] public MultiWeightageKind WeightageName = MultiWeightageKind.AccuracyMicroAvg; - - public IOutputCombiner> CreateComponent(IHostEnvironment env) => new MultiWeightedAverage(env, this); } private readonly MultiWeightageKind _weightageKind; diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs index 812b8af2eb..0bf435aaca 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs @@ -33,14 +33,14 @@ private static VersionInfo GetVersionInfo() } [TlcModule.Component(Name = LoadName, FriendlyName = Stacking.UserName)] - public sealed class Arguments : ArgumentsBase, ISupportOutputCombinerFactory + public sealed class Arguments : ArgumentsBase, ISupportRegressionOutputCombinerFactory { public Arguments() { BasePredictorType = new SubComponent, SignatureRegressorTrainer>("FastTreeRegression"); } - public IOutputCombiner CreateComponent(IHostEnvironment env) => new RegressionStacking(env, this); + public IRegressionOutputCombiner CreateComponent(IHostEnvironment env) => new RegressionStacking(env, this); } public RegressionStacking(IHostEnvironment env, Arguments args) diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/Stacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/Stacking.cs index cf7a731177..afd6e3f958 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/Stacking.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/Stacking.cs @@ -33,14 +33,14 @@ private static VersionInfo GetVersionInfo() } [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] - public sealed class Arguments : ArgumentsBase, ISupportOutputCombinerFactory + public sealed class Arguments : ArgumentsBase, ISupportBinaryOutputCombinerFactory { public Arguments() { BasePredictorType = new SubComponent, SignatureBinaryClassifierTrainer>("FastTreeBinaryClassification"); } - public IOutputCombiner 
CreateComponent(IHostEnvironment env) => new Stacking(env, this); + public IBinaryOutputCombiner CreateComponent(IHostEnvironment env) => new Stacking(env, this); } public Stacking(IHostEnvironment env, Arguments args) diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/WeightedAverage.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/WeightedAverage.cs index a6449febcc..4a824235a7 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/WeightedAverage.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/WeightedAverage.cs @@ -36,13 +36,13 @@ private static VersionInfo GetVersionInfo() } [TlcModule.Component(Name = LoadName, FriendlyName = Stacking.UserName)] - public sealed class Arguments: ISupportOutputCombinerFactory + public sealed class Arguments: ISupportBinaryOutputCombinerFactory { [Argument(ArgumentType.AtMostOnce, HelpText = "The metric type to be used to find the weights for each model", ShortName = "wn", SortOrder = 50)] [TGUI(Label = "Weightage Name", Description = "The weights are calculated according to the selected metric")] public WeightageKind WeightageName = WeightageKind.Auc; - public IOutputCombiner CreateComponent(IHostEnvironment env) => new WeightedAverage(env, this); + public IBinaryOutputCombiner CreateComponent(IHostEnvironment env) => new WeightedAverage(env, this); } private WeightageKind _weightageKind; diff --git a/src/Microsoft.ML.Ensemble/PipelineEnsemble.cs b/src/Microsoft.ML.Ensemble/PipelineEnsemble.cs index 441d0afc02..54afc9ccd2 100644 --- a/src/Microsoft.ML.Ensemble/PipelineEnsemble.cs +++ b/src/Microsoft.ML.Ensemble/PipelineEnsemble.cs @@ -240,7 +240,7 @@ public override PredictionKind PredictionKind } } - public ImplOne(IHostEnvironment env, IPredictorModel[] predictors, IOutputCombiner combiner, string scoreColumnKind) + public ImplOne(IHostEnvironment env, IPredictorModel[] predictors, IRegressionOutputCombiner combiner, string scoreColumnKind) : base(env, predictors, combiner, LoaderSignature, scoreColumnKind) { } @@ -268,7 
+268,7 @@ public override PredictionKind PredictionKind private readonly VectorType _scoreType; - public ImplVec(IHostEnvironment env, IPredictorModel[] predictors, IOutputCombiner> combiner) + public ImplVec(IHostEnvironment env, IPredictorModel[] predictors, IMultiClassOutputCombiner combiner) : base(env, predictors, combiner, LoaderSignature, MetadataUtils.Const.ScoreColumnKind.MultiClassClassification) { int classCount = CheckLabelColumn(Host, predictors, false); @@ -290,7 +290,7 @@ private sealed class ImplOneWithCalibrator : SchemaBindablePipelineEnsemble combiner) + public ImplOneWithCalibrator(IHostEnvironment env, IPredictorModel[] predictors, IBinaryOutputCombiner combiner) : base(env, predictors, combiner, LoaderSignature, MetadataUtils.Const.ScoreColumnKind.BinaryClassification) { Host.Assert(_scoreColumnKind == MetadataUtils.Const.ScoreColumnKind.BinaryClassification); @@ -515,18 +515,18 @@ public static SchemaBindablePipelineEnsembleBase Create(IHostEnvironment env, IP switch (scoreColumnKind) { case MetadataUtils.Const.ScoreColumnKind.BinaryClassification: - var binaryCombiner = combiner as IOutputCombiner; + var binaryCombiner = combiner as IBinaryOutputCombiner; if (binaryCombiner == null) throw env.Except("Combiner type incompatible with score column kind"); return new ImplOneWithCalibrator(env, predictors, binaryCombiner); case MetadataUtils.Const.ScoreColumnKind.Regression: case MetadataUtils.Const.ScoreColumnKind.AnomalyDetection: - var regressionCombiner = combiner as IOutputCombiner; + var regressionCombiner = combiner as IRegressionOutputCombiner; if (regressionCombiner == null) throw env.Except("Combiner type incompatible with score column kind"); return new ImplOne(env, predictors, regressionCombiner, scoreColumnKind); case MetadataUtils.Const.ScoreColumnKind.MultiClassClassification: - var vectorCombiner = combiner as IOutputCombiner>; + var vectorCombiner = combiner as IMultiClassOutputCombiner; if (vectorCombiner == null) throw 
env.Except("Combiner type incompatible with score column kind"); return new ImplVec(env, predictors, vectorCombiner); diff --git a/src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs b/src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs index 1bffb977b1..96e9f5b886 100644 --- a/src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/ISubModelSelector.cs @@ -33,9 +33,20 @@ public interface IMulticlassSubModelSelector : ISubModelSelector public delegate void SignatureEnsembleSubModelSelector(); - [TlcModule.ComponentKind("EnsembleSubModelSelector")] - public interface ISupportSubModelSelectorFactory : IComponentFactory> + [TlcModule.ComponentKind("EnsembleMulticlassSubModelSelector")] + public interface ISupportMulticlassSubModelSelectorFactory : IComponentFactory { } + [TlcModule.ComponentKind("EnsembleBinarySubModelSelector")] + public interface ISupportBinarySubModelSelectorFactory: IComponentFactory + { + + } + + [TlcModule.ComponentKind("EnsembleRegressionSubModelSelector")] + public interface ISupportRegressionSubModelSelectorFactory : IComponentFactory + { + + } } diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelectorMultiClass.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelectorMultiClass.cs index ab9509c637..39912427eb 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelectorMultiClass.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelectorMultiClass.cs @@ -13,7 +13,7 @@ namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector { - public class AllSelectorMultiClass : BaseSubModelSelector> + public class AllSelectorMultiClass : BaseSubModelSelector>, IMulticlassSubModelSelector { public const string UserName = "All Selector"; public const string LoadName = "AllSelectorMultiClass"; diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseSubModelSelector.cs 
b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseSubModelSelector.cs index 518250bbf5..2976214cfe 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseSubModelSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseSubModelSelector.cs @@ -27,7 +27,7 @@ protected BaseSubModelSelector(IHostEnvironment env, string name) protected void Print(IChannel ch, IList>> models, string metricName) { - // REVIEW tfinley: The output format was faithfully reproduced from the original format, but it's unclear + // REVIEW: The output format was faithfully reproduced from the original format, but it's unclear // to me that this is right. Why have two bars in the header line, but only one bar in the results? ch.Info("List of models and the metrics after sorted"); ch.Info("| {0}(Sorted) || Name of Model |", metricName); @@ -81,20 +81,20 @@ public virtual void CalculateMetrics(FeatureSubsetModel + [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] + public sealed class Arguments : DiverseSelectorArguments, ISupportBinarySubModelSelectorFactory { - public ISubModelSelector CreateComponent(IHostEnvironment env) => new BestDiverseSelectorBinary(env, this); + public IBinarySubModelSelector CreateComponent(IHostEnvironment env) => new BestDiverseSelectorBinary(env, this); } public BestDiverseSelectorBinary(IHostEnvironment env, Arguments args) diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs index dee3c27511..2b7a6ec1ae 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs @@ -18,7 +18,7 @@ namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector { using TVectorPredictor = IPredictorProducing>; - public sealed class BestDiverseSelectorMultiClass : 
BaseDiverseSelector, IDiversityMeasure>> + public sealed class BestDiverseSelectorMultiClass : BaseDiverseSelector, IDiversityMeasure>>, IMulticlassSubModelSelector { public const string UserName = "Best Diverse Selector"; public const string LoadName = "BestDiverseSelectorMultiClass"; @@ -28,10 +28,10 @@ public override string DiversityMeasureLoadname get { return MultiDisagreementDiversityMeasure.LoadName; } } - [TlcModule.Component(Name = BestDiverseSelectorMultiClass.LoadName, FriendlyName = BestDiverseSelectorMultiClass.UserName)] - public sealed class Arguments : DiverseSelectorArguments, ISupportSubModelSelectorFactory> + [TlcModule.Component(Name = BestDiverseSelectorMultiClass.LoadName, FriendlyName = UserName)] + public sealed class Arguments : DiverseSelectorArguments, ISupportMulticlassSubModelSelectorFactory { - public ISubModelSelector> CreateComponent(IHostEnvironment env) => new BestDiverseSelectorMultiClass(env, this); + public IMulticlassSubModelSelector CreateComponent(IHostEnvironment env) => new BestDiverseSelectorMultiClass(env, this); } public BestDiverseSelectorMultiClass(IHostEnvironment env, Arguments args) diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs index 1bd8a0b9ca..bbeb5f9a31 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs @@ -28,9 +28,9 @@ public override string DiversityMeasureLoadname } [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] - public sealed class Arguments : DiverseSelectorArguments, ISupportSubModelSelectorFactory + public sealed class Arguments : DiverseSelectorArguments, ISupportRegressionSubModelSelectorFactory { - public ISubModelSelector CreateComponent(IHostEnvironment env) => new BestDiverseSelectorRegression(env, this); + 
public IRegressionSubModelSelector CreateComponent(IHostEnvironment env) => new BestDiverseSelectorRegression(env, this); } public BestDiverseSelectorRegression(IHostEnvironment env, Arguments args) diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs index 273c8a682a..75201d8fb6 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs @@ -19,14 +19,15 @@ namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector public sealed class BestPerformanceRegressionSelector : BaseBestPerformanceSelector, IRegressionSubModelSelector { [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] - public sealed class Arguments : ArgumentsBase, ISupportSubModelSelectorFactory + public sealed class Arguments : ArgumentsBase, ISupportRegressionSubModelSelectorFactory { [Argument(ArgumentType.AtMostOnce, HelpText = "The metric type to be used to find the best performance", ShortName = "mn", SortOrder = 50)] [TGUI(Label = "Metric Name")] public RegressionEvaluator.Metrics MetricName = RegressionEvaluator.Metrics.L1; - public ISubModelSelector CreateComponent(IHostEnvironment env) => new BestPerformanceRegressionSelector(env, this); + public IRegressionSubModelSelector CreateComponent(IHostEnvironment env) => new BestPerformanceRegressionSelector(env, this); } + public const string UserName = "Best Performance Selector"; public const string LoadName = "BestPerformanceRegressionSelector"; diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs index fae99c4b00..e8257cd3a8 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs +++ 
b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs @@ -19,13 +19,13 @@ namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector public sealed class BestPerformanceSelector : BaseBestPerformanceSelector, IBinarySubModelSelector { [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] - public sealed class Arguments : ArgumentsBase, ISupportSubModelSelectorFactory + public sealed class Arguments : ArgumentsBase, ISupportBinarySubModelSelectorFactory { [Argument(ArgumentType.AtMostOnce, HelpText = "The metric type to be used to find the best performance", ShortName = "mn", SortOrder = 50)] [TGUI(Label = "Metric Name")] public BinaryClassifierEvaluator.Metrics MetricName = BinaryClassifierEvaluator.Metrics.Auc; - public ISubModelSelector CreateComponent(IHostEnvironment env) => new BestPerformanceSelector(env, this); + public IBinarySubModelSelector CreateComponent(IHostEnvironment env) => new BestPerformanceSelector(env, this); } public const string UserName = "Best Performance Selector"; diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs index 0dd1d77acb..fe98ff50b2 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs @@ -16,16 +16,16 @@ namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector { - public class BestPerformanceSelectorMultiClass : BaseBestPerformanceSelector> + public class BestPerformanceSelectorMultiClass : BaseBestPerformanceSelector>, IMulticlassSubModelSelector { [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] - public sealed class Arguments : ArgumentsBase,ISupportSubModelSelectorFactory> + public sealed class Arguments : ArgumentsBase, ISupportMulticlassSubModelSelectorFactory { 
[Argument(ArgumentType.AtMostOnce, HelpText = "The metric type to be used to find the best performance", ShortName = "mn", SortOrder = 50)] [TGUI(Label = "Metric Name")] public MultiClassClassifierEvaluator.Metrics MetricName = MultiClassClassifierEvaluator.Metrics.AccuracyMicro; - public ISubModelSelector> CreateComponent(IHostEnvironment env) => new BestPerformanceSelectorMultiClass(env, this); + IMulticlassSubModelSelector IComponentFactory.CreateComponent(IHostEnvironment env) => new BestPerformanceSelectorMultiClass(env, this); } public const string UserName = "Best Performance Selector"; diff --git a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BaseSubsetSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BaseSubsetSelector.cs index b3455370a1..d31439b79d 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BaseSubsetSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BaseSubsetSelector.cs @@ -82,7 +82,7 @@ public IEnumerable GetBatches(IRandom rand) if (BatchSize > 0) { - // REVIEW shonk: How should we carve the data into batches? + // REVIEW: How should we carve the data into batches? ch.Warning("Batch support is temporarily disabled"); } diff --git a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs index f226f9b866..25e7f8d64d 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs @@ -24,7 +24,7 @@ public sealed class BootstrapSelector : BaseSubsetSelector GetSubsets(Batch batch, IRandom rand) { for (int i = 0; i < Size; i++) { - // REVIEW tfinley: Consider ways to reintroduce "balanced" samples. + // REVIEW: Consider ways to reintroduce "balanced" samples. 
var viewTrain = new BootstrapSampleTransform(Host, new BootstrapSampleTransform.Arguments(), Data.Data); var dataTrain = RoleMappedData.Create(viewTrain, Data.Schema.GetColumnRoleNames()); yield return FeatureSelector.SelectFeatures(dataTrain, rand); diff --git a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs index 7d07935a1e..db11d0914c 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs @@ -41,7 +41,7 @@ public override IEnumerable GetSubsets(Batch batch, IRandom rand) args.Seed = (uint)rand.Next(); IDataTransform view = new GenerateNumberTransform(Host, args, Data.Data); - // REVIEW shonk: This won't be very efficient when _size is large. + // REVIEW: This won't be very efficient when _size is large. for (int i = 0; i < Size; i++) { var viewTrain = new RangeFilter(Host, new RangeFilter.Arguments() { Column = name, Min = (Double)i / Size, Max = (Double)(i + 1) / Size }, view); diff --git a/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs index 0c14d10db9..c178fd9f6a 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs @@ -14,6 +14,7 @@ using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; using Microsoft.ML.Ensemble.EntryPoints; +using Microsoft.ML.Runtime.Internal.Internallearn; [assembly: LoadableClass(EnsembleTrainer.Summary, typeof(EnsembleTrainer), typeof(EnsembleTrainer.Arguments), new[] { typeof(SignatureBinaryClassifierTrainer), typeof(SignatureTrainer) }, @@ -36,6 +37,15 @@ public sealed class EnsembleTrainer : EnsembleTrainerBase, SignatureBinaryClassifierTrainer>("LinearSVM") }; @@ -44,9 +54,14 @@ public Arguments() 
} } + private readonly ISupportBinaryOutputCombinerFactory _outputCombiner; + public EnsembleTrainer(IHostEnvironment env, Arguments args) : base(args, env, LoadNameValue) { + SubModelSelector = args.SubModelSelectorType.CreateComponent(Host); + _outputCombiner = args.OutputCombiner; + Combiner = args.OutputCombiner.CreateComponent(Host); } public override PredictionKind PredictionKind @@ -66,7 +81,7 @@ public TScalarPredictor CombineModels(IEnumerable m.Weight).ToArray(); if (weights.All(w => w == 1)) weights = null; - var combiner = Args.OutputCombiner.CreateComponent(Host); + var combiner = _outputCombiner.CreateComponent(Host); var p = models.First().Value; TScalarPredictor predictor = null; diff --git a/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs b/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs index e9cbc4fc3c..e41c5ef67a 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs @@ -153,7 +153,7 @@ public ValueMapper GetMapper() maps[i](ref tmp, ref predictions[i], ref probabilities[i]); }); - // REVIEW ansarim(nihejazi): Bug 3303: DistributionEnsemble - AveragedWeights are used only in one of the two PredictDistributions overloads + // REVIEW: DistributionEnsemble - AveragedWeights are used only in one of the two PredictDistributions overloads combine(ref dst, predictions, Weights); }; diff --git a/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictorBase.cs b/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictorBase.cs index a577a39f1d..ff24d38996 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictorBase.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictorBase.cs @@ -155,7 +155,7 @@ public void SaveSummary(TextWriter writer, RoleMappedSchema schema) writer.WriteLine(";; Partition model {0}", i); writer.WriteLine(";; Weight={0}", (Weights != null ? 
Weights[i] : 1)); - // REVIEW ansarim: The featureName Collection names may vary for different base learners. + // REVIEW: The featureName Collection names may vary for different base learners. // How do we get the right collection for the base learners? if (Models[i].Predictor is ICanSaveSummary summaryModel) summaryModel.SaveSummary(writer, schema); diff --git a/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs b/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs index 718709a87a..b418a7ccb1 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs @@ -52,14 +52,7 @@ public abstract class ArgumentsBase : LearnerInputBaseWithLabel [TGUI(Label = "Show Sub-Model Metrics")] public bool ShowMetrics; - [Argument(ArgumentType.Multiple, HelpText = "Output combiner", ShortName = "oc", SortOrder = 5)] - [TGUI(Label = "Output combiner", Description = "Output combiner type")] - public ISupportOutputCombinerFactory OutputCombiner; - [Argument(ArgumentType.Multiple, HelpText = "Algorithm to prune the base learners for selective Ensemble", ShortName = "pt", SortOrder = 4)] - [TGUI(Label = "Sub-Model Selector(pruning) Type", - Description = "Algorithm to prune the base learners for selective Ensemble")] - public ISupportSubModelSelectorFactory SubModelSelectorType; [Argument(ArgumentType.Multiple, HelpText = "Base predictor type", ShortName = "bp,basePredictorTypes", SortOrder = 1, Visibility =ArgumentAttribute.VisibilityType.CmdLineOnly)] public SubComponent>, TSig>[] BasePredictors; @@ -75,9 +68,8 @@ public abstract class ArgumentsBase : LearnerInputBaseWithLabel protected readonly ITrainer>[] Trainers; private readonly ISubsetSelector _subsetSelector; - private readonly ISubModelSelector _subModelSelector; - - protected readonly IOutputCombiner Combiner; + protected ISubModelSelector SubModelSelector; + protected IOutputCombiner Combiner; protected List>> Models; @@ -102,8 +94,6 @@ 
internal EnsembleTrainerBase(ArgumentsBase args, IHostEnvironment env, string na ch.Warning("The base predictor count is greater than models count. Some of the base predictors will be ignored."); _subsetSelector = Args.SamplingType.CreateComponent(Host); - _subModelSelector = Args.SubModelSelectorType.CreateComponent(Host); - Combiner = Args.OutputCombiner.CreateComponent(Host); Trainers = new ITrainer>[NumModels]; for (int i = 0; i < Trainers.Length; i++) @@ -147,10 +137,10 @@ private void TrainCore(IChannel ch, RoleMappedData data) // 1. Subset Selection var stackingTrainer = Combiner as IStackingTrainer; - //REVIEW ansarim: Implement stacking for Batch mode. + //REVIEW: Implement stacking for Batch mode. ch.CheckUserArg(stackingTrainer == null || Args.BatchSize <= 0, nameof(Args.BatchSize), "Stacking works only with Non-batch mode"); - var validationDataSetProportion = _subModelSelector.ValidationDatasetProportion; + var validationDataSetProportion = SubModelSelector.ValidationDatasetProportion; if (stackingTrainer != null) validationDataSetProportion = Math.Max(validationDataSetProportion, stackingTrainer.ValidationDatasetProportion); @@ -180,7 +170,7 @@ private void TrainCore(IChannel ch, RoleMappedData data) Trainers[(int)index].CreatePredictor(), subset.SelectedFeatures, null); - _subModelSelector.CalculateMetrics(model, _subsetSelector, subset, batch, needMetrics); + SubModelSelector.CalculateMetrics(model, _subsetSelector, subset, batch, needMetrics); models[(int)index] = model; } } @@ -197,7 +187,7 @@ private void TrainCore(IChannel ch, RoleMappedData data) if (Args.ShowMetrics) PrintMetrics(ch, modelsList); - modelsList = _subModelSelector.Prune(modelsList).ToList(); + modelsList = SubModelSelector.Prune(modelsList).ToList(); if (stackingTrainer != null) stackingTrainer.Train(modelsList, _subsetSelector.GetTestData(null, batch), Host); @@ -226,7 +216,7 @@ private bool EnsureMinimumFeaturesSelected(Subset subset) protected virtual void 
PrintMetrics(IChannel ch, List>> models) { - // REVIEW tfinley: The formatting of this method is bizarre and seemingly not even self-consistent + // REVIEW: The formatting of this method is bizarre and seemingly not even self-consistent // w.r.t. its usage of |. Is this intentional? if (models.Count == 0 || models[0].Metrics == null) return; diff --git a/src/Microsoft.ML.Ensemble/Trainer/Multiclass/EnsembleMultiClassPredictor.cs b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/EnsembleMultiClassPredictor.cs index 3c5561f585..f846d3f80a 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/Multiclass/EnsembleMultiClassPredictor.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/EnsembleMultiClassPredictor.cs @@ -38,7 +38,7 @@ private static VersionInfo GetVersionInfo() public ColumnType OutputType { get { return _outputType; } } internal EnsembleMultiClassPredictor(IHostEnvironment env, FeatureSubsetModel[] models, - IOutputCombiner> combiner, Single[] weights = null) + IMultiClassOutputCombiner combiner, Single[] weights = null) : base(env, RegistrationName, models, combiner, weights) { InitializeMappers(out _mappers, out _inputType, out _outputType); diff --git a/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs index c297526508..7914753a83 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs @@ -13,6 +13,7 @@ using Microsoft.ML.Runtime.Ensemble.OutputCombiners; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; +using Microsoft.ML.Runtime.Internal.Internallearn; using Microsoft.ML.Runtime.Learners; [assembly: LoadableClass(MulticlassDataPartitionEnsembleTrainer.Summary, typeof(MulticlassDataPartitionEnsembleTrainer), @@ -29,7 +30,7 @@ namespace 
Microsoft.ML.Runtime.Ensemble /// public sealed class MulticlassDataPartitionEnsembleTrainer : EnsembleTrainerBase, EnsembleMultiClassPredictor, - IMulticlassSubModelSelector, IOutputCombiner>, SignatureMultiClassClassifierTrainer>, + IMulticlassSubModelSelector, IMultiClassOutputCombiner, SignatureMultiClassClassifierTrainer>, IModelCombiner, TVectorPredictor> { public const string LoadNameValue = "WeightedEnsembleMulticlass"; @@ -38,6 +39,14 @@ public sealed class MulticlassDataPartitionEnsembleTrainer : public sealed class Arguments : ArgumentsBase { + [Argument(ArgumentType.Multiple, HelpText = "Algorithm to prune the base learners for selective Ensemble", ShortName = "pt", SortOrder = 4)] + [TGUI(Label = "Sub-Model Selector(pruning) Type", Description = "Algorithm to prune the base learners for selective Ensemble")] + public ISupportMulticlassSubModelSelectorFactory SubModelSelectorType; + + [Argument(ArgumentType.Multiple, HelpText = "Output combiner", ShortName = "oc", SortOrder = 5)] + [TGUI(Label = "Output combiner", Description = "Output combiner type")] + public ISupportMulticlassOutputCombinerFactory OutputCombiner; + public Arguments() { BasePredictors = new[] { new SubComponent, SignatureMultiClassClassifierTrainer>("MultiClassLogisticRegression") }; @@ -46,9 +55,14 @@ public Arguments() } } + private readonly ISupportMulticlassOutputCombinerFactory _outputCombiner; + public MulticlassDataPartitionEnsembleTrainer(IHostEnvironment env, Arguments args) : base(args, env, LoadNameValue) { + SubModelSelector = args.SubModelSelectorType.CreateComponent(Host); + _outputCombiner = args.OutputCombiner; + Combiner = args.OutputCombiner.CreateComponent(Host); } public override PredictionKind PredictionKind { get { return PredictionKind.MultiClassClassification; } } @@ -56,7 +70,7 @@ public MulticlassDataPartitionEnsembleTrainer(IHostEnvironment env, Arguments ar public override EnsembleMultiClassPredictor CreatePredictor() { var combiner = Combiner; - return new 
EnsembleMultiClassPredictor(Host, CreateModels(), combiner); + return new EnsembleMultiClassPredictor(Host, CreateModels(), combiner as IMultiClassOutputCombiner); } public TVectorPredictor CombineModels(IEnumerable> models) @@ -67,7 +81,7 @@ public TVectorPredictor CombineModels(IEnumerable new FeatureSubsetModel(k.Value)).ToArray(), - Args.OutputCombiner.CreateComponent(Host), weights); + _outputCombiner.CreateComponent(Host), weights); return predictor; } diff --git a/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs index 10349dbbd7..7fff93169b 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs @@ -13,6 +13,7 @@ using Microsoft.ML.Runtime.Ensemble.OutputCombiners; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; +using Microsoft.ML.Runtime.Internal.Internallearn; using Microsoft.ML.Runtime.Learners; [assembly: LoadableClass(typeof(RegressionEnsembleTrainer), typeof(RegressionEnsembleTrainer.Arguments), @@ -32,6 +33,14 @@ public sealed class RegressionEnsembleTrainer : EnsembleTrainerBase, SignatureRegressorTrainer>("OnlineGradientDescent") }; @@ -40,9 +49,14 @@ public Arguments() } } + private readonly ISupportRegressionOutputCombinerFactory _outputCombiner; + public RegressionEnsembleTrainer(IHostEnvironment env, Arguments args) : base(args, env, LoadNameValue) { + SubModelSelector = args.SubModelSelectorType.CreateComponent(Host); + _outputCombiner = args.OutputCombiner; + Combiner = args.OutputCombiner.CreateComponent(Host); } public override PredictionKind PredictionKind @@ -60,7 +74,7 @@ public TScalarPredictor CombineModels(IEnumerable m.Weight).ToArray(); if (weights.All(w => w == 1)) weights = null; - var combiner = Args.OutputCombiner.CreateComponent(Host); + var combiner = 
_outputCombiner.CreateComponent(Host); var p = models.First().Value; var predictor = new EnsemblePredictor(Host, p.PredictionKind, From 7139d4a23822014b934266648040cb071e467424 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Tue, 19 Jun 2018 16:06:32 -0700 Subject: [PATCH 06/20] some cleaning, remove one subcomponent update ep-list and CSharpApi.cs --- .../EntryPoints/Ensemble.cs | 6 +- .../SubModelSelector/BaseDiverseSelector.cs | 7 +- .../BestDiverseSelectorBinary.cs | 6 +- .../BestDiverseSelectorMultiClass.cs | 7 +- .../BestDiverseSelectorRegression.cs | 6 +- .../Trainer/EnsembleDistributionPredictor.cs | 5 +- .../Trainer/EnsemblePredictor.cs | 5 +- .../Regression/RegressionEnsembleTrainer.cs | 2 - .../Standard/OlsLinearRegression.cs | 2 - src/Microsoft.ML/CSharpApi.cs | 1005 ++++++------ .../Common/EntryPoints/core_ep-list.tsv | 6 +- .../Common/EntryPoints/core_manifest.json | 1458 +++++++++-------- .../UnitTests/TestEntryPoints.cs | 20 +- test/Microsoft.ML.TestFramework/Datasets.cs | 2 +- 14 files changed, 1304 insertions(+), 1233 deletions(-) diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs b/src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs index cb72aac2ae..d00a835192 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs +++ b/src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs @@ -13,7 +13,7 @@ namespace Microsoft.ML.Ensemble.EntryPoints { public static class Ensemble { - [TlcModule.EntryPoint(Name = "Trainer.BinaryEnsemble", Desc = "Train binary ensemble.", UserName = EnsembleTrainer.UserNameValue)] + [TlcModule.EntryPoint(Name = "Trainers.BinaryEnsemble", Desc = "Train binary ensemble.", UserName = EnsembleTrainer.UserNameValue)] public static CommonOutputs.BinaryClassificationOutput CreateBinaryEnsemble(IHostEnvironment env, EnsembleTrainer.Arguments input) { Contracts.CheckValue(env, nameof(env)); @@ -26,7 +26,7 @@ public static CommonOutputs.BinaryClassificationOutput CreateBinaryEnsemble(IHos () => 
LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.LabelColumn)); } - [TlcModule.EntryPoint(Name = "Trainer.ClassificationEnsemble", Desc = "Train multiclass ensemble.", UserName = EnsembleTrainer.UserNameValue)] + [TlcModule.EntryPoint(Name = "Trainers.ClassificationEnsemble", Desc = "Train multiclass ensemble.", UserName = EnsembleTrainer.UserNameValue)] public static CommonOutputs.MulticlassClassificationOutput CreateMultiClassEnsemble(IHostEnvironment env, MulticlassDataPartitionEnsembleTrainer.Arguments input) { Contracts.CheckValue(env, nameof(env)); @@ -39,7 +39,7 @@ public static CommonOutputs.MulticlassClassificationOutput CreateMultiClassEnsem () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.LabelColumn)); } - [TlcModule.EntryPoint(Name = "Trainer.RegressionEnsemble", Desc = "Train regression ensemble.", UserName = EnsembleTrainer.UserNameValue)] + [TlcModule.EntryPoint(Name = "Trainers.RegressionEnsemble", Desc = "Train regression ensemble.", UserName = EnsembleTrainer.UserNameValue)] public static CommonOutputs.RegressionOutput CreateRegressionEnsemble(IHostEnvironment env, RegressionEnsembleTrainer.Arguments input) { Contracts.CheckValue(env, nameof(env)); diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseDiverseSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseDiverseSelector.cs index 790b262806..fbf3c02839 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseDiverseSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseDiverseSelector.cs @@ -27,7 +27,7 @@ public abstract class DiverseSelectorArguments : ArgumentsBase private readonly ISupportDiversityMeasureFactory _diversityMetricType; private ConcurrentDictionary>, TOutput[]> _predictions; - public abstract string DiversityMeasureLoadname { get; } + protected abstract ISupportDiversityMeasureFactory DefaultDiversityMetricType { get; } protected internal 
BaseDiverseSelector(IHostEnvironment env, DiverseSelectorArguments args, string name) : base(args, env, name) @@ -39,10 +39,7 @@ protected internal BaseDiverseSelector(IHostEnvironment env, DiverseSelectorArgu protected IDiversityMeasure CreateDiversityMetric() { if (_diversityMetricType == null) - { - var sc = new SubComponent(DiversityMeasureLoadname); - return sc.CreateInstance(Host); - } + return DefaultDiversityMetricType.CreateComponent(Host); return _diversityMetricType.CreateComponent(Host); } diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs index 2b60719e8c..1ab403dd07 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Concurrent; using System.Collections.Generic; +using Microsoft.ML.Ensemble.EntryPoints; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure; @@ -22,10 +23,7 @@ public sealed class BestDiverseSelectorBinary : BaseDiverseSelector DefaultDiversityMetricType => new DisagreementDiversityFactory(); [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] public sealed class Arguments : DiverseSelectorArguments, ISupportBinarySubModelSelectorFactory diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs index 2b7a6ec1ae..c41fb89f7d 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Concurrent; using System.Collections.Generic; +using 
Microsoft.ML.Ensemble.EntryPoints; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble.Selector; @@ -22,11 +23,7 @@ public sealed class BestDiverseSelectorMultiClass : BaseDiverseSelector> DefaultDiversityMetricType => new MultinDisagreementDiversityFactory(); [TlcModule.Component(Name = BestDiverseSelectorMultiClass.LoadName, FriendlyName = UserName)] public sealed class Arguments : DiverseSelectorArguments, ISupportMulticlassSubModelSelectorFactory diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs index bbeb5f9a31..aaa970c471 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Concurrent; using System.Collections.Generic; +using Microsoft.ML.Ensemble.EntryPoints; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Ensemble.Selector; using Microsoft.ML.Runtime.Ensemble.Selector.DiversityMeasure; @@ -22,10 +23,7 @@ public sealed class BestDiverseSelectorRegression : BaseDiverseSelector DefaultDiversityMetricType => new RegressionDisagreementDiversityFactory(); [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] public sealed class Arguments : DiverseSelectorArguments, ISupportRegressionSubModelSelectorFactory diff --git a/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs b/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs index e41c5ef67a..139cf9207d 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs @@ -15,8 +15,7 @@ // These are for deserialization from a model repository. 
[assembly: LoadableClass(typeof(EnsembleDistributionPredictor), null, typeof(SignatureLoadModel), - EnsembleDistributionPredictor.UserName, - EnsembleDistributionPredictor.LoaderSignature)] + EnsembleDistributionPredictor.UserName, EnsembleDistributionPredictor.LoaderSignature)] namespace Microsoft.ML.Runtime.Ensemble { @@ -122,7 +121,7 @@ protected override void SaveCore(ModelSaveContext ctx) ctx.Writer.Write((int)PredictionKind); } - + public ValueMapper GetMapper() { diff --git a/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictor.cs b/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictor.cs index 3e51ee8a90..514257a643 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictor.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictor.cs @@ -11,8 +11,9 @@ using Microsoft.ML.Runtime.Ensemble.OutputCombiners; using Microsoft.ML.Runtime.EntryPoints; -[assembly: LoadableClass(typeof(EnsemblePredictor), null, typeof(SignatureLoadModel), - EnsemblePredictor.UserName, EnsemblePredictor.LoaderSignature)] +[assembly: LoadableClass(typeof(EnsemblePredictor), null, typeof(SignatureLoadModel), EnsemblePredictor.UserName, + EnsemblePredictor.LoaderSignature)] + [assembly: EntryPointModule(typeof(EnsemblePredictor))] namespace Microsoft.ML.Runtime.Ensemble diff --git a/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs index 7fff93169b..29643dc971 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs @@ -12,9 +12,7 @@ using Microsoft.ML.Runtime.Ensemble; using Microsoft.ML.Runtime.Ensemble.OutputCombiners; using Microsoft.ML.Runtime.Ensemble.Selector; -using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; using Microsoft.ML.Runtime.Internal.Internallearn; -using Microsoft.ML.Runtime.Learners; [assembly: LoadableClass(typeof(RegressionEnsembleTrainer), 
typeof(RegressionEnsembleTrainer.Arguments), new[] { typeof(SignatureRegressorTrainer), typeof(SignatureTrainer) }, diff --git a/src/Microsoft.ML.StandardLearners/Standard/OlsLinearRegression.cs b/src/Microsoft.ML.StandardLearners/Standard/OlsLinearRegression.cs index d927ba0a43..7ea557159e 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/OlsLinearRegression.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/OlsLinearRegression.cs @@ -28,8 +28,6 @@ "OLS Linear Regression Executor", OlsLinearRegressionPredictor.LoaderSignature)] -[assembly: LoadableClass(typeof(void), typeof(OlsLinearRegressionTrainer), null, typeof(SignatureEntryPointModule), OlsLinearRegressionTrainer.LoadNameValue)] - namespace Microsoft.ML.Runtime.Learners { public sealed class OlsLinearRegressionTrainer : TrainerBase diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index f4b46427d5..1f94063f02 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -478,52 +478,40 @@ public void Add(Microsoft.ML.Models.TrainTestEvaluator input, Microsoft.ML.Model _jsonNodes.Add(Serialize("Models.TrainTestEvaluator", input, output)); } - public Microsoft.ML.Trainer.BinaryEnsemble.Output Add(Microsoft.ML.Trainer.BinaryEnsemble input) - { - var output = new Microsoft.ML.Trainer.BinaryEnsemble.Output(); - Add(input, output); - return output; - } - - public void Add(Microsoft.ML.Trainer.BinaryEnsemble input, Microsoft.ML.Trainer.BinaryEnsemble.Output output) - { - _jsonNodes.Add(Serialize("Trainer.BinaryEnsemble", input, output)); - } - - public Microsoft.ML.Trainer.ClassificationEnsemble.Output Add(Microsoft.ML.Trainer.ClassificationEnsemble input) + public Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier.Output Add(Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier input) { - var output = new Microsoft.ML.Trainer.ClassificationEnsemble.Output(); + var output = new Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier.Output(); Add(input, 
output); return output; } - public void Add(Microsoft.ML.Trainer.ClassificationEnsemble input, Microsoft.ML.Trainer.ClassificationEnsemble.Output output) + public void Add(Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier input, Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier.Output output) { - _jsonNodes.Add(Serialize("Trainer.ClassificationEnsemble", input, output)); + _jsonNodes.Add(Serialize("Trainers.AveragedPerceptronBinaryClassifier", input, output)); } - public Microsoft.ML.Trainer.RegressionEnsemble.Output Add(Microsoft.ML.Trainer.RegressionEnsemble input) + public Microsoft.ML.Trainers.BinaryEnsemble.Output Add(Microsoft.ML.Trainers.BinaryEnsemble input) { - var output = new Microsoft.ML.Trainer.RegressionEnsemble.Output(); + var output = new Microsoft.ML.Trainers.BinaryEnsemble.Output(); Add(input, output); return output; } - public void Add(Microsoft.ML.Trainer.RegressionEnsemble input, Microsoft.ML.Trainer.RegressionEnsemble.Output output) + public void Add(Microsoft.ML.Trainers.BinaryEnsemble input, Microsoft.ML.Trainers.BinaryEnsemble.Output output) { - _jsonNodes.Add(Serialize("Trainer.RegressionEnsemble", input, output)); + _jsonNodes.Add(Serialize("Trainers.BinaryEnsemble", input, output)); } - public Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier.Output Add(Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier input) + public Microsoft.ML.Trainers.ClassificationEnsemble.Output Add(Microsoft.ML.Trainers.ClassificationEnsemble input) { - var output = new Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier.Output(); + var output = new Microsoft.ML.Trainers.ClassificationEnsemble.Output(); Add(input, output); return output; } - public void Add(Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier input, Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier.Output output) + public void Add(Microsoft.ML.Trainers.ClassificationEnsemble input, Microsoft.ML.Trainers.ClassificationEnsemble.Output output) { - 
_jsonNodes.Add(Serialize("Trainers.AveragedPerceptronBinaryClassifier", input, output)); + _jsonNodes.Add(Serialize("Trainers.ClassificationEnsemble", input, output)); } public Microsoft.ML.Trainers.FastForestBinaryClassifier.Output Add(Microsoft.ML.Trainers.FastForestBinaryClassifier input) @@ -718,6 +706,18 @@ public void Add(Microsoft.ML.Trainers.PoissonRegressor input, Microsoft.ML.Train _jsonNodes.Add(Serialize("Trainers.PoissonRegressor", input, output)); } + public Microsoft.ML.Trainers.RegressionEnsemble.Output Add(Microsoft.ML.Trainers.RegressionEnsemble input) + { + var output = new Microsoft.ML.Trainers.RegressionEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Trainers.RegressionEnsemble input, Microsoft.ML.Trainers.RegressionEnsemble.Output output) + { + _jsonNodes.Add(Serialize("Trainers.RegressionEnsemble", input, output)); + } + public Microsoft.ML.Trainers.StochasticDualCoordinateAscentBinaryClassifier.Output Add(Microsoft.ML.Trainers.StochasticDualCoordinateAscentBinaryClassifier input) { var output = new Microsoft.ML.Trainers.StochasticDualCoordinateAscentBinaryClassifier.Output(); @@ -4005,53 +4005,108 @@ public sealed class Output } } - namespace Trainer + namespace Trainers { /// - /// Train binary ensemble. + /// Train a Average perceptron. /// - public sealed partial class BinaryEnsemble : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class AveragedPerceptronBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise. + /// Loss Function /// - public int? 
NumModels { get; set; } + [JsonConverter(typeof(ComponentSerializer))] + public ClassificationLossFunction LossFunction { get; set; } = new HingeLossClassificationLossFunction(); /// - /// Batch size + /// The calibrator kind to apply to the predictor. Specify null for no calibration /// - public int BatchSize { get; set; } = -1; + [JsonConverter(typeof(ComponentSerializer))] + public CalibratorTrainer Calibrator { get; set; } = new PlattCalibratorCalibratorTrainer(); /// - /// Sampling Type + /// The maximum number of examples to use when training the calibrator /// - [JsonConverter(typeof(ComponentSerializer))] - public EnsembleSubsetSelector SamplingType { get; set; } = new BootstrapSelectorEnsembleSubsetSelector(); + public int MaxCalibrationExamples { get; set; } = 1000000; /// - /// All the base learners will run asynchronously if the value is true + /// Learning rate /// - public bool TrainParallel { get; set; } = false; + [TlcModule.SweepableDiscreteParamAttribute("LearningRate", new object[]{0.01f, 0.1f, 0.5f, 1f})] + public float LearningRate { get; set; } = 1f; /// - /// True, if metrics for each model need to be evaluated and shown in comparison table. This is done by using validation set if available or the training set + /// Decrease learning rate /// - public bool ShowMetrics { get; set; } = false; + [TlcModule.SweepableDiscreteParamAttribute("DecreaseLearningRate", new object[]{false, true})] + public bool DecreaseLearningRate { get; set; } = false; /// - /// Output combiner + /// Number of examples after which weights will be reset to the current average /// - [JsonConverter(typeof(ComponentSerializer))] - public EnsembleOutputCombiner OutputCombiner { get; set; } = new MedianEnsembleOutputCombiner(); + public long? 
ResetWeightsAfterXExamples { get; set; } /// - /// Algorithm to prune the base learners for selective Ensemble + /// Instead of updating averaged weights on every example, only update when loss is nonzero /// - [JsonConverter(typeof(ComponentSerializer))] - public EnsembleSubModelSelector SubModelSelectorType { get; set; } = new AllSelectorEnsembleSubModelSelector(); + public bool DoLazyUpdates { get; set; } = true; + + /// + /// L2 Regularization Weight + /// + [TlcModule.SweepableFloatParamAttribute("L2RegularizerWeight", 0f, 0.5f)] + public float L2RegularizerWeight { get; set; } + + /// + /// Extra weight given to more recent updates + /// + public float RecencyGain { get; set; } + + /// + /// Whether Recency Gain is multiplicative (vs. additive) + /// + public bool RecencyGainMulti { get; set; } = false; + + /// + /// Do averaging? + /// + public bool Averaged { get; set; } = true; + + /// + /// The inexactness tolerance for averaging + /// + public float AveragedTolerance { get; set; } = 0.01f; + + /// + /// Number of iterations + /// + [TlcModule.SweepableLongParamAttribute("NumIterations", 1, 100, stepSize:10, isLogScale:true)] + public int NumIterations { get; set; } = 1; + + /// + /// Initial Weights and bias, comma-separated + /// + public string InitialWeights { get; set; } + + /// + /// Init weights diameter + /// + [TlcModule.SweepableFloatParamAttribute("InitWtsDiameter", 0f, 1f, numSteps:5)] + public float InitWtsDiameter { get; set; } + + /// + /// Whether to shuffle for each training iteration + /// + [TlcModule.SweepableDiscreteParamAttribute("Shuffle", new object[]{false, true})] + public bool Shuffle { get; set; } = true; + + /// + /// Size of cache when trained in Scope + /// + public int StreamingCacheSize { get; set; } = 1000000; /// /// Column to use for labels @@ -4095,18 +4150,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new 
InvalidOperationException($"{ nameof(BinaryEnsemble)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(AveragedPerceptronBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new BinaryEnsemblePipelineStep(output); + return new AveragedPerceptronBinaryClassifierPipelineStep(output); } - private class BinaryEnsemblePipelineStep : ILearningPipelinePredictorStep + private class AveragedPerceptronBinaryClassifierPipelineStep : ILearningPipelinePredictorStep { - public BinaryEnsemblePipelineStep(Output output) + public AveragedPerceptronBinaryClassifierPipelineStep(Output output) { Model = output.PredictorModel; } @@ -4116,16 +4171,28 @@ public BinaryEnsemblePipelineStep(Output output) } } - namespace Trainer + namespace Trainers { /// - /// Train multiclass ensemble. + /// Train binary ensemble. /// - public sealed partial class ClassificationEnsemble : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class BinaryEnsemble : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { + /// + /// Algorithm to prune the base learners for selective Ensemble + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleBinarySubModelSelector SubModelSelectorType { get; set; } = new AllSelectorEnsembleBinarySubModelSelector(); + + /// + /// Output combiner + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleBinaryOutputCombiner OutputCombiner { get; set; } = new MedianEnsembleBinaryOutputCombiner(); + /// /// Number of models per batch. 
If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise. /// @@ -4152,18 +4219,6 @@ public sealed partial class ClassificationEnsemble : Microsoft.ML.Runtime.EntryP /// public bool ShowMetrics { get; set; } = false; - /// - /// Output combiner - /// - [JsonConverter(typeof(ComponentSerializer))] - public EnsembleOutputCombiner OutputCombiner { get; set; } = new MultiMedianEnsembleOutputCombiner(); - - /// - /// Algorithm to prune the base learners for selective Ensemble - /// - [JsonConverter(typeof(ComponentSerializer))] - public EnsembleSubModelSelector SubModelSelectorType { get; set; } = new AllSelectorMultiClassEnsembleSubModelSelector(); - /// /// Column to use for labels /// @@ -4190,7 +4245,7 @@ public sealed partial class ClassificationEnsemble : Microsoft.ML.Runtime.EntryP public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IMulticlassClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -4206,18 +4261,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(ClassificationEnsemble)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(BinaryEnsemble)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new ClassificationEnsemblePipelineStep(output); + return new BinaryEnsemblePipelineStep(output); } - private class 
ClassificationEnsemblePipelineStep : ILearningPipelinePredictorStep + private class BinaryEnsemblePipelineStep : ILearningPipelinePredictorStep { - public ClassificationEnsemblePipelineStep(Output output) + public BinaryEnsemblePipelineStep(Output output) { Model = output.PredictorModel; } @@ -4227,16 +4282,28 @@ public ClassificationEnsemblePipelineStep(Output output) } } - namespace Trainer + namespace Trainers { /// - /// Train regression ensemble. + /// Train multiclass ensemble. /// - public sealed partial class RegressionEnsemble : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class ClassificationEnsemble : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { + /// + /// Algorithm to prune the base learners for selective Ensemble + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleMulticlassSubModelSelector SubModelSelectorType { get; set; } = new AllSelectorMultiClassEnsembleMulticlassSubModelSelector(); + + /// + /// Output combiner + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleMulticlassOutputCombiner OutputCombiner { get; set; } = new MultiMedianEnsembleMulticlassOutputCombiner(); + /// /// Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise. 
/// @@ -4263,18 +4330,6 @@ public sealed partial class RegressionEnsemble : Microsoft.ML.Runtime.EntryPoint /// public bool ShowMetrics { get; set; } = false; - /// - /// Output combiner - /// - [JsonConverter(typeof(ComponentSerializer))] - public EnsembleOutputCombiner OutputCombiner { get; set; } = new MedianEnsembleOutputCombiner(); - - /// - /// Algorithm to prune the base learners for selective Ensemble - /// - [JsonConverter(typeof(ComponentSerializer))] - public EnsembleSubModelSelector SubModelSelectorType { get; set; } = new AllSelectorEnsembleSubModelSelector(); - /// /// Column to use for labels /// @@ -4301,7 +4356,7 @@ public sealed partial class RegressionEnsemble : Microsoft.ML.Runtime.EntryPoint public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IMulticlassClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -4317,18 +4372,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(RegressionEnsemble)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(ClassificationEnsemble)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new RegressionEnsemblePipelineStep(output); + return new ClassificationEnsemblePipelineStep(output); } - private class RegressionEnsemblePipelineStep : ILearningPipelinePredictorStep + private class ClassificationEnsemblePipelineStep : ILearningPipelinePredictorStep { - public 
RegressionEnsemblePipelineStep(Output output) + public ClassificationEnsemblePipelineStep(Output output) { Model = output.PredictorModel; } @@ -4340,19 +4395,25 @@ public RegressionEnsemblePipelineStep(Output output) namespace Trainers { + public enum Bundle : byte + { + None = 0, + AggregateLowPopulation = 1, + Adjacent = 2 + } + /// - /// Train a Average perceptron. + /// Uses a random forest learner to perform binary classification. /// - public sealed partial class AveragedPerceptronBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class FastForestBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Loss Function + /// Upper bound on absolute value of single tree output /// - [JsonConverter(typeof(ComponentSerializer))] - public ClassificationLossFunction LossFunction { get; set; } = new HingeLossClassificationLossFunction(); + public double MaxTreeOutput { get; set; } = 100d; /// /// The calibrator kind to apply to the predictor. 
Specify null for no calibration @@ -4366,227 +4427,55 @@ public sealed partial class AveragedPerceptronBinaryClassifier : Microsoft.ML.Ru public int MaxCalibrationExamples { get; set; } = 1000000; /// - /// Learning rate + /// Number of labels to be sampled from each leaf to make the distribtuion /// - [TlcModule.SweepableDiscreteParamAttribute("LearningRate", new object[]{0.01f, 0.1f, 0.5f, 1f})] - public float LearningRate { get; set; } = 1f; + public int QuantileSampleCount { get; set; } = 100; /// - /// Decrease learning rate + /// Allows to choose Parallel FastTree Learning Algorithm /// - [TlcModule.SweepableDiscreteParamAttribute("DecreaseLearningRate", new object[]{false, true})] - public bool DecreaseLearningRate { get; set; } = false; + [JsonConverter(typeof(ComponentSerializer))] + public ParallelTraining ParallelTrainer { get; set; } = new SingleParallelTraining(); /// - /// Number of examples after which weights will be reset to the current average + /// The number of threads to use /// - public long? ResetWeightsAfterXExamples { get; set; } + public int? NumThreads { get; set; } /// - /// Instead of updating averaged weights on every example, only update when loss is nonzero + /// The seed of the random number generator /// - public bool DoLazyUpdates { get; set; } = true; + public int RngSeed { get; set; } = 123; /// - /// L2 Regularization Weight + /// The seed of the active feature selection /// - [TlcModule.SweepableFloatParamAttribute("L2RegularizerWeight", 0f, 0.5f)] - public float L2RegularizerWeight { get; set; } + public int FeatureSelectSeed { get; set; } = 123; /// - /// Extra weight given to more recent updates + /// The entropy (regularization) coefficient between 0 and 1 /// - public float RecencyGain { get; set; } + public double EntropyCoefficient { get; set; } /// - /// Whether Recency Gain is multiplicative (vs. 
additive) + /// The number of histograms in the pool (between 2 and numLeaves) /// - public bool RecencyGainMulti { get; set; } = false; + public int HistogramPoolSize { get; set; } = -1; /// - /// Do averaging? + /// Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose /// - public bool Averaged { get; set; } = true; + public bool? DiskTranspose { get; set; } /// - /// The inexactness tolerance for averaging + /// Whether to collectivize features during dataset preparation to speed up training /// - public float AveragedTolerance { get; set; } = 0.01f; + public bool FeatureFlocks { get; set; } = true; /// - /// Number of iterations + /// Whether to do split based on multiple categorical feature values. /// - [TlcModule.SweepableLongParamAttribute("NumIterations", 1, 100, stepSize:10, isLogScale:true)] - public int NumIterations { get; set; } = 1; - - /// - /// Initial Weights and bias, comma-separated - /// - public string InitialWeights { get; set; } - - /// - /// Init weights diameter - /// - [TlcModule.SweepableFloatParamAttribute("InitWtsDiameter", 0f, 1f, numSteps:5)] - public float InitWtsDiameter { get; set; } - - /// - /// Whether to shuffle for each training iteration - /// - [TlcModule.SweepableDiscreteParamAttribute("Shuffle", new object[]{false, true})] - public bool Shuffle { get; set; } = true; - - /// - /// Size of cache when trained in Scope - /// - public int StreamingCacheSize { get; set; } = 1000000; - - /// - /// Column to use for labels - /// - public string LabelColumn { get; set; } = "Label"; - - /// - /// The data to be used for training - /// - public Var TrainingData { get; set; } = new Var(); - - /// - /// Column to use for features - /// - public string FeatureColumn { get; set; } = "Features"; - - /// - /// Normalize option for the feature column - /// - public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = 
Microsoft.ML.Models.NormalizeOption.Auto; - - /// - /// Whether learner should cache input training data - /// - public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - - - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput - { - /// - /// The trained model - /// - public Var PredictorModel { get; set; } = new Var(); - - } - public Var GetInputData() => TrainingData; - - public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) - { - if (previousStep != null) - { - if (!(previousStep is ILearningPipelineDataStep dataStep)) - { - throw new InvalidOperationException($"{ nameof(AveragedPerceptronBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); - } - - TrainingData = dataStep.Data; - } - Output output = experiment.Add(this); - return new AveragedPerceptronBinaryClassifierPipelineStep(output); - } - - private class AveragedPerceptronBinaryClassifierPipelineStep : ILearningPipelinePredictorStep - { - public AveragedPerceptronBinaryClassifierPipelineStep(Output output) - { - Model = output.PredictorModel; - } - - public Var Model { get; } - } - } - } - - namespace Trainers - { - public enum Bundle : byte - { - None = 0, - AggregateLowPopulation = 1, - Adjacent = 2 - } - - - /// - /// Uses a random forest learner to perform binary classification. 
- /// - public sealed partial class FastForestBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem - { - - - /// - /// Upper bound on absolute value of single tree output - /// - public double MaxTreeOutput { get; set; } = 100d; - - /// - /// The calibrator kind to apply to the predictor. Specify null for no calibration - /// - [JsonConverter(typeof(ComponentSerializer))] - public CalibratorTrainer Calibrator { get; set; } = new PlattCalibratorCalibratorTrainer(); - - /// - /// The maximum number of examples to use when training the calibrator - /// - public int MaxCalibrationExamples { get; set; } = 1000000; - - /// - /// Number of labels to be sampled from each leaf to make the distribtuion - /// - public int QuantileSampleCount { get; set; } = 100; - - /// - /// Allows to choose Parallel FastTree Learning Algorithm - /// - [JsonConverter(typeof(ComponentSerializer))] - public ParallelTraining ParallelTrainer { get; set; } = new SingleParallelTraining(); - - /// - /// The number of threads to use - /// - public int? NumThreads { get; set; } - - /// - /// The seed of the random number generator - /// - public int RngSeed { get; set; } = 123; - - /// - /// The seed of the active feature selection - /// - public int FeatureSelectSeed { get; set; } = 123; - - /// - /// The entropy (regularization) coefficient between 0 and 1 - /// - public double EntropyCoefficient { get; set; } - - /// - /// The number of histograms in the pool (between 2 and numLeaves) - /// - public int HistogramPoolSize { get; set; } = -1; - - /// - /// Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose - /// - public bool? 
DiskTranspose { get; set; } - - /// - /// Whether to collectivize features during dataset preparation to speed up training - /// - public bool FeatureFlocks { get; set; } = true; - - /// - /// Whether to do split based on multiple categorical feature values. - /// - public bool CategoricalSplit { get; set; } = false; + public bool CategoricalSplit { get; set; } = false; /// /// Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features. @@ -8004,79 +7893,49 @@ namespace Trainers { /// - /// Train an SDCA binary model. + /// Train regression ensemble. /// - public sealed partial class StochasticDualCoordinateAscentBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class RegressionEnsemble : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Loss Function + /// Algorithm to prune the base learners for selective Ensemble /// [JsonConverter(typeof(ComponentSerializer))] - public SDCAClassificationLossFunction LossFunction { get; set; } = new LogLossSDCAClassificationLossFunction(); - - /// - /// Apply weight to the positive class, for imbalanced data - /// - public float PositiveInstanceWeight { get; set; } = 1f; + public EnsembleRegressionSubModelSelector SubModelSelectorType { get; set; } = new AllSelectorEnsembleRegressionSubModelSelector(); /// - /// The calibrator kind to apply to the predictor. 
Specify null for no calibration + /// Output combiner /// [JsonConverter(typeof(ComponentSerializer))] - public CalibratorTrainer Calibrator { get; set; } = new PlattCalibratorCalibratorTrainer(); - - /// - /// The maximum number of examples to use when training the calibrator - /// - public int MaxCalibrationExamples { get; set; } = 1000000; - - /// - /// L2 regularizer constant. By default the l2 constant is automatically inferred based on data set. - /// - [TlcModule.SweepableDiscreteParamAttribute("L2Const", new object[]{"", 1E-07f, 1E-06f, 1E-05f, 0.0001f, 0.001f, 0.01f})] - public float? L2Const { get; set; } - - /// - /// L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set. - /// - [TlcModule.SweepableDiscreteParamAttribute("L1Threshold", new object[]{"", 0f, 0.25f, 0.5f, 0.75f, 1f})] - public float? L1Threshold { get; set; } + public EnsembleRegressionOutputCombiner OutputCombiner { get; set; } = new MedianEnsembleRegressionOutputCombiner(); /// - /// Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. - /// - public int? NumThreads { get; set; } - - /// - /// The tolerance for the ratio between duality gap and primal loss for convergence checking. + /// Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise. /// - [TlcModule.SweepableDiscreteParamAttribute("ConvergenceTolerance", new object[]{0.001f, 0.01f, 0.1f, 0.2f})] - public float ConvergenceTolerance { get; set; } = 0.1f; + public int? NumModels { get; set; } /// - /// Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic. + /// Batch size /// - [TlcModule.SweepableDiscreteParamAttribute("MaxIterations", new object[]{"", 10, 20, 100})] - public int? 
MaxIterations { get; set; } + public int BatchSize { get; set; } = -1; /// - /// Shuffle data every epoch? + /// Sampling Type /// - [TlcModule.SweepableDiscreteParamAttribute("Shuffle", new object[]{false, true})] - public bool Shuffle { get; set; } = true; + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleSubsetSelector SamplingType { get; set; } = new BootstrapSelectorEnsembleSubsetSelector(); /// - /// Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations. + /// All the base learners will run asynchronously if the value is true /// - public int? CheckFrequency { get; set; } + public bool TrainParallel { get; set; } = false; /// - /// The learning rate for adjusting bias from being regularized. + /// True, if metrics for each model need to be evaluated and shown in comparison table. This is done by using validation set if available or the training set /// - [TlcModule.SweepableDiscreteParamAttribute("BiasLearningRate", new object[]{0f, 0.01f, 0.1f, 1f})] - public float BiasLearningRate { get; set; } + public bool ShowMetrics { get; set; } = false; /// /// Column to use for labels @@ -8104,7 +7963,7 @@ public sealed partial class StochasticDualCoordinateAscentBinaryClassifier : Mic public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -8120,18 +7979,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new 
InvalidOperationException($"{ nameof(StochasticDualCoordinateAscentBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(RegressionEnsemble)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new StochasticDualCoordinateAscentBinaryClassifierPipelineStep(output); + return new RegressionEnsemblePipelineStep(output); } - private class StochasticDualCoordinateAscentBinaryClassifierPipelineStep : ILearningPipelinePredictorStep + private class RegressionEnsemblePipelineStep : ILearningPipelinePredictorStep { - public StochasticDualCoordinateAscentBinaryClassifierPipelineStep(Output output) + public RegressionEnsemblePipelineStep(Output output) { Model = output.PredictorModel; } @@ -8145,9 +8004,9 @@ namespace Trainers { /// - /// Train an SDCA multi class model + /// Train an SDCA binary model. /// - public sealed partial class StochasticDualCoordinateAscentClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class StochasticDualCoordinateAscentBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { @@ -8158,19 +8017,160 @@ public sealed partial class StochasticDualCoordinateAscentClassifier : Microsoft public SDCAClassificationLossFunction LossFunction { get; set; } = new LogLossSDCAClassificationLossFunction(); /// - /// L2 regularizer constant. By default the l2 constant is automatically inferred based on data set. 
+ /// Apply weight to the positive class, for imbalanced data /// - [TlcModule.SweepableDiscreteParamAttribute("L2Const", new object[]{"", 1E-07f, 1E-06f, 1E-05f, 0.0001f, 0.001f, 0.01f})] - public float? L2Const { get; set; } + public float PositiveInstanceWeight { get; set; } = 1f; /// - /// L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set. + /// The calibrator kind to apply to the predictor. Specify null for no calibration /// - [TlcModule.SweepableDiscreteParamAttribute("L1Threshold", new object[]{"", 0f, 0.25f, 0.5f, 0.75f, 1f})] - public float? L1Threshold { get; set; } + [JsonConverter(typeof(ComponentSerializer))] + public CalibratorTrainer Calibrator { get; set; } = new PlattCalibratorCalibratorTrainer(); /// - /// Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. + /// The maximum number of examples to use when training the calibrator + /// + public int MaxCalibrationExamples { get; set; } = 1000000; + + /// + /// L2 regularizer constant. By default the l2 constant is automatically inferred based on data set. + /// + [TlcModule.SweepableDiscreteParamAttribute("L2Const", new object[]{"", 1E-07f, 1E-06f, 1E-05f, 0.0001f, 0.001f, 0.01f})] + public float? L2Const { get; set; } + + /// + /// L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set. + /// + [TlcModule.SweepableDiscreteParamAttribute("L1Threshold", new object[]{"", 0f, 0.25f, 0.5f, 0.75f, 1f})] + public float? L1Threshold { get; set; } + + /// + /// Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. + /// + public int? 
NumThreads { get; set; } + + /// + /// The tolerance for the ratio between duality gap and primal loss for convergence checking. + /// + [TlcModule.SweepableDiscreteParamAttribute("ConvergenceTolerance", new object[]{0.001f, 0.01f, 0.1f, 0.2f})] + public float ConvergenceTolerance { get; set; } = 0.1f; + + /// + /// Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic. + /// + [TlcModule.SweepableDiscreteParamAttribute("MaxIterations", new object[]{"", 10, 20, 100})] + public int? MaxIterations { get; set; } + + /// + /// Shuffle data every epoch? + /// + [TlcModule.SweepableDiscreteParamAttribute("Shuffle", new object[]{false, true})] + public bool Shuffle { get; set; } = true; + + /// + /// Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations. + /// + public int? CheckFrequency { get; set; } + + /// + /// The learning rate for adjusting bias from being regularized. 
+ /// + [TlcModule.SweepableDiscreteParamAttribute("BiasLearningRate", new object[]{0f, 0.01f, 0.1f, 1f})] + public float BiasLearningRate { get; set; } + + /// + /// Column to use for labels + /// + public string LabelColumn { get; set; } = "Label"; + + /// + /// The data to be used for training + /// + public Var TrainingData { get; set; } = new Var(); + + /// + /// Column to use for features + /// + public string FeatureColumn { get; set; } = "Features"; + + /// + /// Normalize option for the feature column + /// + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; + + /// + /// Whether learner should cache input training data + /// + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + public Var GetInputData() => TrainingData; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(StochasticDualCoordinateAscentBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + TrainingData = dataStep.Data; + } + Output output = experiment.Add(this); + return new StochasticDualCoordinateAscentBinaryClassifierPipelineStep(output); + } + + private class StochasticDualCoordinateAscentBinaryClassifierPipelineStep : ILearningPipelinePredictorStep + { + public StochasticDualCoordinateAscentBinaryClassifierPipelineStep(Output output) + { + Model = output.PredictorModel; + } + + public Var Model { get; } + } + } + } + + namespace Trainers + { + + /// + /// 
Train an SDCA multi class model + /// + public sealed partial class StochasticDualCoordinateAscentClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + { + + + /// + /// Loss Function + /// + [JsonConverter(typeof(ComponentSerializer))] + public SDCAClassificationLossFunction LossFunction { get; set; } = new LogLossSDCAClassificationLossFunction(); + + /// + /// L2 regularizer constant. By default the l2 constant is automatically inferred based on data set. + /// + [TlcModule.SweepableDiscreteParamAttribute("L2Const", new object[]{"", 1E-07f, 1E-06f, 1E-05f, 0.0001f, 0.001f, 0.01f})] + public float? L2Const { get; set; } + + /// + /// L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set. + /// + [TlcModule.SweepableDiscreteParamAttribute("L1Threshold", new object[]{"", 0f, 0.25f, 0.5f, 0.75f, 1f})] + public float? L1Threshold { get; set; } + + /// + /// Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. /// public int? 
NumThreads { get; set; } @@ -14620,209 +14620,260 @@ public sealed class UPEarlyStoppingCriterion : EarlyStoppingCriterion internal override string ComponentName => "UP"; } - public abstract class EnsembleDiversityMeasure : ComponentKind {} + public abstract class EnsembleBinaryOutputCombiner : ComponentKind {} - public sealed class DisagreementDiversityMeasureEnsembleDiversityMeasure : EnsembleDiversityMeasure + public sealed class AverageEnsembleBinaryOutputCombiner : EnsembleBinaryOutputCombiner { - internal override string ComponentName => "DisagreementDiversityMeasure"; + internal override string ComponentName => "Average"; } - public sealed class MultiDisagreementDiversityMeasureEnsembleDiversityMeasure : EnsembleDiversityMeasure + public sealed class MedianEnsembleBinaryOutputCombiner : EnsembleBinaryOutputCombiner { - internal override string ComponentName => "MultiDisagreementDiversityMeasure"; + internal override string ComponentName => "Median"; } - public sealed class RegressionDisagreementDiversityMeasureEnsembleDiversityMeasure : EnsembleDiversityMeasure + public sealed class StackingEnsembleBinaryOutputCombiner : EnsembleBinaryOutputCombiner { - internal override string ComponentName => "RegressionDisagreementDiversityMeasure"; + /// + /// The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; + + internal override string ComponentName => "Stacking"; } - public abstract class EnsembleFeatureSelector : ComponentKind {} + public sealed class VotingEnsembleBinaryOutputCombiner : EnsembleBinaryOutputCombiner + { + internal override string ComponentName => "Voting"; + } - public sealed class AllFeatureSelectorEnsembleFeatureSelector : EnsembleFeatureSelector + public enum WeightageKind { - internal override string ComponentName => "AllFeatureSelector"; + Accuracy = 0, + Auc = 1, + PosPrecision = 2, + PosRecall = 3, + NegPrecision = 4, + NegRecall = 5 } - public sealed class RandomFeatureSelectorEnsembleFeatureSelector : EnsembleFeatureSelector + public sealed class WeightedAverageEnsembleBinaryOutputCombiner : EnsembleBinaryOutputCombiner { /// - /// The proportion of features to be selected. The range is 0.0-1.0 + /// The metric type to be used to find the weights for each model /// - public float FeaturesSelectionProportion { get; set; } = 0.8f; + public WeightageKind WeightageName { get; set; } = WeightageKind.Auc; - internal override string ComponentName => "RandomFeatureSelector"; + internal override string ComponentName => "WeightedAverage"; } - public abstract class EnsembleOutputCombiner : ComponentKind {} + public abstract class EnsembleBinarySubModelSelector : ComponentKind {} - public sealed class AverageEnsembleOutputCombiner : EnsembleOutputCombiner + public sealed class AllSelectorEnsembleBinarySubModelSelector : EnsembleBinarySubModelSelector { - internal override string ComponentName => "Average"; + internal override string ComponentName => "AllSelector"; } - public sealed class MedianEnsembleOutputCombiner : EnsembleOutputCombiner + public sealed class BestDiverseSelectorEnsembleBinarySubModelSelector : EnsembleBinarySubModelSelector { - internal override string ComponentName => "Median"; - } - + /// + /// The metric type to be used to find 
the diversity among base learners + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleDiversityMeasure DiversityMetricType { get; set; } + /// + /// The proportion of best base learners to be selected. The range is 0.0-1.0 + /// + public float LearnersSelectionProportion { get; set; } = 0.5f; - public sealed class MultiAverageEnsembleOutputCombiner : EnsembleOutputCombiner - { /// - /// Whether to normalize the output of base models before combining them + /// The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set /// - public bool Normalize { get; set; } = true; + public float ValidationDatasetProportion { get; set; } = 0.3f; - internal override string ComponentName => "MultiAverage"; + internal override string ComponentName => "BestDiverseSelector"; + } + + public enum BinaryClassifierEvaluatorMetrics + { + Accuracy = 0, + PosPrecName = 1, + PosRecallName = 2, + NegPrecName = 3, + NegRecallName = 4, + Auc = 5, + LogLoss = 6, + LogLossReduction = 7, + F1 = 8, + AuPrc = 9 } - public sealed class MultiMedianEnsembleOutputCombiner : EnsembleOutputCombiner + public sealed class BestPerformanceSelectorEnsembleBinarySubModelSelector : EnsembleBinarySubModelSelector { /// - /// Whether to normalize the output of base models before combining them + /// The metric type to be used to find the best performance /// - public bool Normalize { get; set; } = true; - - internal override string ComponentName => "MultiMedian"; - } - + public BinaryClassifierEvaluatorMetrics MetricName { get; set; } = BinaryClassifierEvaluatorMetrics.Auc; + /// + /// The proportion of best base learners to be selected. The range is 0.0-1.0 + /// + public float LearnersSelectionProportion { get; set; } = 0.5f; - public sealed class MultiStackingEnsembleOutputCombiner : EnsembleOutputCombiner - { /// /// The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set /// public float ValidationDatasetProportion { get; set; } = 0.3f; - internal override string ComponentName => "MultiStacking"; + internal override string ComponentName => "BestPerformanceSelector"; } + public abstract class EnsembleDiversityMeasure : ComponentKind {} - public sealed class MultiVotingEnsembleOutputCombiner : EnsembleOutputCombiner + + public sealed class DisagreementDiversityMeasureEnsembleDiversityMeasure : EnsembleDiversityMeasure { - /// - /// Whether to normalize the output of base models before combining them - /// - public bool Normalize { get; set; } = true; + internal override string ComponentName => "DisagreementDiversityMeasure"; + } - internal override string ComponentName => "MultiVoting"; + + + public sealed class MultiDisagreementDiversityMeasureEnsembleDiversityMeasure : EnsembleDiversityMeasure + { + internal override string ComponentName => "MultiDisagreementDiversityMeasure"; } - public enum MultiWeightageKind + + + public sealed class RegressionDisagreementDiversityMeasureEnsembleDiversityMeasure : EnsembleDiversityMeasure { - AccuracyMicroAvg = 0, - AccuracyMacroAvg = 1 + internal override string ComponentName => "RegressionDisagreementDiversityMeasure"; } + public abstract class EnsembleFeatureSelector : ComponentKind {} + - public sealed class MultiWeightedAverageEnsembleOutputCombiner : EnsembleOutputCombiner + public sealed class AllFeatureSelectorEnsembleFeatureSelector : EnsembleFeatureSelector + { + internal override string ComponentName => "AllFeatureSelector"; + } + + + + public sealed class RandomFeatureSelectorEnsembleFeatureSelector : EnsembleFeatureSelector { /// - /// The metric type to be used to find the weights for each model + /// The proportion of features to be selected. 
The range is 0.0-1.0 /// - public MultiWeightageKind WeightageName { get; set; } = MultiWeightageKind.AccuracyMicroAvg; + public float FeaturesSelectionProportion { get; set; } = 0.8f; + + internal override string ComponentName => "RandomFeatureSelector"; + } + public abstract class EnsembleMulticlassOutputCombiner : ComponentKind {} + + + + public sealed class MultiAverageEnsembleMulticlassOutputCombiner : EnsembleMulticlassOutputCombiner + { /// /// Whether to normalize the output of base models before combining them /// public bool Normalize { get; set; } = true; - internal override string ComponentName => "MultiWeightedAverage"; + internal override string ComponentName => "MultiAverage"; } - public sealed class RegressionStackingEnsembleOutputCombiner : EnsembleOutputCombiner + public sealed class MultiMedianEnsembleMulticlassOutputCombiner : EnsembleMulticlassOutputCombiner { /// - /// The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set + /// Whether to normalize the output of base models before combining them /// - public float ValidationDatasetProportion { get; set; } = 0.3f; + public bool Normalize { get; set; } = true; - internal override string ComponentName => "RegressionStacking"; + internal override string ComponentName => "MultiMedian"; } - public sealed class StackingEnsembleOutputCombiner : EnsembleOutputCombiner + public sealed class MultiStackingEnsembleMulticlassOutputCombiner : EnsembleMulticlassOutputCombiner { /// /// The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set /// public float ValidationDatasetProportion { get; set; } = 0.3f; - internal override string ComponentName => "Stacking"; + internal override string ComponentName => "MultiStacking"; } - public sealed class VotingEnsembleOutputCombiner : EnsembleOutputCombiner + public sealed class MultiVotingEnsembleMulticlassOutputCombiner : EnsembleMulticlassOutputCombiner { - internal override string ComponentName => "Voting"; + /// + /// Whether to normalize the output of base models before combining them + /// + public bool Normalize { get; set; } = true; + + internal override string ComponentName => "MultiVoting"; } - public enum WeightageKind + public enum MultiWeightageKind { - Accuracy = 0, - Auc = 1, - PosPrecision = 2, - PosRecall = 3, - NegPrecision = 4, - NegRecall = 5 + AccuracyMicroAvg = 0, + AccuracyMacroAvg = 1 } - public sealed class WeightedAverageEnsembleOutputCombiner : EnsembleOutputCombiner + public sealed class MultiWeightedAverageEnsembleMulticlassOutputCombiner : EnsembleMulticlassOutputCombiner { /// /// The metric type to be used to find the weights for each model /// - public WeightageKind WeightageName { get; set; } = WeightageKind.Auc; - - internal override string ComponentName => "WeightedAverage"; - } - - public abstract class EnsembleSubModelSelector : ComponentKind {} - + public MultiWeightageKind WeightageName { get; set; } = MultiWeightageKind.AccuracyMicroAvg; + /// + /// Whether to normalize the output of base models before combining them + /// + public bool Normalize { get; set; } = true; - public sealed class AllSelectorEnsembleSubModelSelector : EnsembleSubModelSelector - { - internal override string ComponentName => "AllSelector"; + internal override string ComponentName => "MultiWeightedAverage"; } + public abstract class EnsembleMulticlassSubModelSelector : ComponentKind {} + - public sealed class AllSelectorMultiClassEnsembleSubModelSelector : EnsembleSubModelSelector + public sealed class 
AllSelectorMultiClassEnsembleMulticlassSubModelSelector : EnsembleMulticlassSubModelSelector { internal override string ComponentName => "AllSelectorMultiClass"; } - public sealed class BestDiverseSelectorEnsembleSubModelSelector : EnsembleSubModelSelector + public sealed class BestDiverseSelectorMultiClassEnsembleMulticlassSubModelSelector : EnsembleMulticlassSubModelSelector { /// /// The metric type to be used to find the diversity among base learners @@ -14840,18 +14891,25 @@ public sealed class BestDiverseSelectorEnsembleSubModelSelector : EnsembleSubMod /// public float ValidationDatasetProportion { get; set; } = 0.3f; - internal override string ComponentName => "BestDiverseSelector"; + internal override string ComponentName => "BestDiverseSelectorMultiClass"; + } + + public enum MultiClassClassifierEvaluatorMetrics + { + AccuracyMicro = 0, + AccuracyMacro = 1, + LogLoss = 2, + LogLossReduction = 3 } - public sealed class BestDiverseSelectorMultiClassEnsembleSubModelSelector : EnsembleSubModelSelector + public sealed class BestPerformanceSelectorMultiClassEnsembleMulticlassSubModelSelector : EnsembleMulticlassSubModelSelector { /// - /// The metric type to be used to find the diversity among base learners + /// The metric type to be used to find the best performance /// - [JsonConverter(typeof(ComponentSerializer))] - public EnsembleDiversityMeasure DiversityMetricType { get; set; } + public MultiClassClassifierEvaluatorMetrics MetricName { get; set; } = MultiClassClassifierEvaluatorMetrics.AccuracyMicro; /// /// The proportion of best base learners to be selected. 
The range is 0.0-1.0 @@ -14863,85 +14921,55 @@ public sealed class BestDiverseSelectorMultiClassEnsembleSubModelSelector : Ense /// public float ValidationDatasetProportion { get; set; } = 0.3f; - internal override string ComponentName => "BestDiverseSelectorMultiClass"; + internal override string ComponentName => "BestPerformanceSelectorMultiClass"; } + public abstract class EnsembleRegressionOutputCombiner : ComponentKind {} - public sealed class BestDiverseSelectorRegressionEnsembleSubModelSelector : EnsembleSubModelSelector + + public sealed class AverageEnsembleRegressionOutputCombiner : EnsembleRegressionOutputCombiner { - /// - /// The metric type to be used to find the diversity among base learners - /// - [JsonConverter(typeof(ComponentSerializer))] - public EnsembleDiversityMeasure DiversityMetricType { get; set; } + internal override string ComponentName => "Average"; + } - /// - /// The proportion of best base learners to be selected. The range is 0.0-1.0 - /// - public float LearnersSelectionProportion { get; set; } = 0.5f; - /// - /// The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set - /// - public float ValidationDatasetProportion { get; set; } = 0.3f; - - internal override string ComponentName => "BestDiverseSelectorRegression"; - } - public enum RegressionEvaluatorMetrics + public sealed class MedianEnsembleRegressionOutputCombiner : EnsembleRegressionOutputCombiner { - L1 = 0, - L2 = 1, - Rms = 2, - Loss = 3, - RSquared = 4 + internal override string ComponentName => "Median"; } - public sealed class BestPerformanceRegressionSelectorEnsembleSubModelSelector : EnsembleSubModelSelector + public sealed class RegressionStackingEnsembleRegressionOutputCombiner : EnsembleRegressionOutputCombiner { - /// - /// The metric type to be used to find the best performance - /// - public RegressionEvaluatorMetrics MetricName { get; set; } = RegressionEvaluatorMetrics.L1; - - /// - /// The proportion of best base learners to be selected. The range is 0.0-1.0 - /// - public float LearnersSelectionProportion { get; set; } = 0.5f; - /// /// The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set /// public float ValidationDatasetProportion { get; set; } = 0.3f; - internal override string ComponentName => "BestPerformanceRegressionSelector"; + internal override string ComponentName => "RegressionStacking"; } - public enum BinaryClassifierEvaluatorMetrics + public abstract class EnsembleRegressionSubModelSelector : ComponentKind {} + + + + public sealed class AllSelectorEnsembleRegressionSubModelSelector : EnsembleRegressionSubModelSelector { - Accuracy = 0, - PosPrecName = 1, - PosRecallName = 2, - NegPrecName = 3, - NegRecallName = 4, - Auc = 5, - LogLoss = 6, - LogLossReduction = 7, - F1 = 8, - AuPrc = 9 + internal override string ComponentName => "AllSelector"; } - public sealed class BestPerformanceSelectorEnsembleSubModelSelector : EnsembleSubModelSelector + public sealed class BestDiverseSelectorRegressionEnsembleRegressionSubModelSelector : EnsembleRegressionSubModelSelector { /// - /// The metric type to be used to find the best performance + /// The metric type to be used to find the diversity among base learners /// - public BinaryClassifierEvaluatorMetrics MetricName { get; set; } = BinaryClassifierEvaluatorMetrics.Auc; + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleDiversityMeasure DiversityMetricType { get; set; } /// /// The proportion of best base learners to be selected. 
The range is 0.0-1.0 @@ -14953,25 +14981,26 @@ public sealed class BestPerformanceSelectorEnsembleSubModelSelector : EnsembleSu /// public float ValidationDatasetProportion { get; set; } = 0.3f; - internal override string ComponentName => "BestPerformanceSelector"; + internal override string ComponentName => "BestDiverseSelectorRegression"; } - public enum MultiClassClassifierEvaluatorMetrics + public enum RegressionEvaluatorMetrics { - AccuracyMicro = 0, - AccuracyMacro = 1, - LogLoss = 2, - LogLossReduction = 3 + L1 = 0, + L2 = 1, + Rms = 2, + Loss = 3, + RSquared = 4 } - public sealed class BestPerformanceSelectorMultiClassEnsembleSubModelSelector : EnsembleSubModelSelector + public sealed class BestPerformanceRegressionSelectorEnsembleRegressionSubModelSelector : EnsembleRegressionSubModelSelector { /// /// The metric type to be used to find the best performance /// - public MultiClassClassifierEvaluatorMetrics MetricName { get; set; } = MultiClassClassifierEvaluatorMetrics.AccuracyMicro; + public RegressionEvaluatorMetrics MetricName { get; set; } = RegressionEvaluatorMetrics.L1; /// /// The proportion of best base learners to be selected. The range is 0.0-1.0 @@ -14983,7 +15012,7 @@ public sealed class BestPerformanceSelectorMultiClassEnsembleSubModelSelector : /// public float ValidationDatasetProportion { get; set; } = 0.3f; - internal override string ComponentName => "BestPerformanceSelectorMultiClass"; + internal override string ComponentName => "BestPerformanceRegressionSelector"; } public abstract class EnsembleSubsetSelector : ComponentKind {} diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index 10678ace1b..7866586a79 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -36,10 +36,9 @@ Models.Summarizer Summarize a linear regression predictor. Microsoft.ML.Runtime. 
Models.SweepResultExtractor Extracts the sweep result. Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro ExtractSweepResult Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro+ResultInput Microsoft.ML.Runtime.EntryPoints.PipelineSweeperMacro+Output Models.TrainTestBinaryEvaluator Train test for binary classification Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro TrainTestBinary Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Output] Models.TrainTestEvaluator General train test for any supported evaluator Microsoft.ML.Runtime.EntryPoints.TrainTestMacro TrainTest Microsoft.ML.Runtime.EntryPoints.TrainTestMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.TrainTestMacro+Output] -Trainer.BinaryEnsemble Train binary ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateBinaryEnsemble Microsoft.ML.Runtime.Ensemble.EnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput -Trainer.ClassificationEnsemble Train multiclass ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateMultiClassEnsemble Microsoft.ML.Runtime.Ensemble.MulticlassDataPartitionEnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput -Trainer.RegressionEnsemble Train regression ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateRegressionEnsemble Microsoft.ML.Runtime.Ensemble.RegressionEnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.AveragedPerceptronBinaryClassifier Train a Average perceptron. Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer TrainBinary Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput +Trainers.BinaryEnsemble Train binary ensemble. 
Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateBinaryEnsemble Microsoft.ML.Runtime.Ensemble.EnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput +Trainers.ClassificationEnsemble Train multiclass ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateMultiClassEnsemble Microsoft.ML.Runtime.Ensemble.MulticlassDataPartitionEnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput Trainers.FastForestBinaryClassifier Uses a random forest learner to perform binary classification. Microsoft.ML.Runtime.FastTree.FastForest TrainBinary Microsoft.ML.Runtime.FastTree.FastForestClassification+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput Trainers.FastForestRegressor Trains a random forest to fit target values using least-squares. Microsoft.ML.Runtime.FastTree.FastForest TrainRegression Microsoft.ML.Runtime.FastTree.FastForestRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.FastTreeBinaryClassifier Uses a logit-boost boosted tree learner to perform binary classification. Microsoft.ML.Runtime.FastTree.FastTree TrainBinary Microsoft.ML.Runtime.FastTree.FastTreeBinaryClassificationTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput @@ -56,6 +55,7 @@ Trainers.NaiveBayesClassifier Train a MultiClassNaiveBayesTrainer. Microsoft.ML. Trainers.OnlineGradientDescentRegressor Train a Online gradient descent perceptron. Microsoft.ML.Runtime.Learners.OnlineGradientDescentTrainer TrainRegression Microsoft.ML.Runtime.Learners.OnlineGradientDescentTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.PcaAnomalyDetector Train an PCA Anomaly model. 
Microsoft.ML.Runtime.PCA.RandomizedPcaTrainer TrainPcaAnomaly Microsoft.ML.Runtime.PCA.RandomizedPcaTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+AnomalyDetectionOutput Trainers.PoissonRegressor Train an Poisson regression model. Microsoft.ML.Runtime.Learners.PoissonRegression TrainRegression Microsoft.ML.Runtime.Learners.PoissonRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput +Trainers.RegressionEnsemble Train regression ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateRegressionEnsemble Microsoft.ML.Runtime.Ensemble.RegressionEnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.StochasticDualCoordinateAscentBinaryClassifier Train an SDCA binary model. Microsoft.ML.Runtime.Learners.Sdca TrainBinary Microsoft.ML.Runtime.Learners.LinearClassificationTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput Trainers.StochasticDualCoordinateAscentClassifier Train an SDCA multi class model Microsoft.ML.Runtime.Learners.Sdca TrainMultiClass Microsoft.ML.Runtime.Learners.SdcaMultiClassTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput Trainers.StochasticDualCoordinateAscentRegressor Train an SDCA regression model Microsoft.ML.Runtime.Learners.Sdca TrainRegression Microsoft.ML.Runtime.Learners.SdcaRegressionTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index dc1692802b..e4e6dcd7db 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -3718,10 +3718,10 @@ ] }, { - "Name": "Trainer.BinaryEnsemble", - "Desc": "Train binary ensemble.", - "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", - "ShortName": null, + "Name": 
"Trainers.AveragedPerceptronBinaryClassifier", + "Desc": "Train a Average perceptron.", + "FriendlyName": "Averaged Perceptron", + "ShortName": "ap", "Inputs": [ { "Name": "TrainingData", @@ -3734,28 +3734,6 @@ "SortOrder": 1.0, "IsNullable": false }, - { - "Name": "SamplingType", - "Type": { - "Kind": "Component", - "ComponentKind": "EnsembleSubsetSelector" - }, - "Desc": "Sampling Type", - "Aliases": [ - "st" - ], - "Required": false, - "SortOrder": 2.0, - "IsNullable": false, - "Default": { - "Name": "BootstrapSelector", - "Settings": { - "FeatureSelector": { - "Name": "AllFeatureSelector" - } - } - } - }, { "Name": "FeatureColumn", "Type": "String", @@ -3768,18 +3746,6 @@ "IsNullable": false, "Default": "Features" }, - { - "Name": "NumModels", - "Type": "Int", - "Desc": "Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise.", - "Aliases": [ - "nm" - ], - "Required": false, - "SortOrder": 3.0, - "IsNullable": true, - "Default": null - }, { "Name": "LabelColumn", "Type": "String", @@ -3792,40 +3758,6 @@ "IsNullable": false, "Default": "Label" }, - { - "Name": "SubModelSelectorType", - "Type": { - "Kind": "Component", - "ComponentKind": "EnsembleSubModelSelector" - }, - "Desc": "Algorithm to prune the base learners for selective Ensemble", - "Aliases": [ - "pt" - ], - "Required": false, - "SortOrder": 4.0, - "IsNullable": false, - "Default": { - "Name": "AllSelector" - } - }, - { - "Name": "OutputCombiner", - "Type": { - "Kind": "Component", - "ComponentKind": "EnsembleOutputCombiner" - }, - "Desc": "Output combiner", - "Aliases": [ - "oc" - ], - "Required": false, - "SortOrder": 5.0, - "IsNullable": false, - "Default": { - "Name": "Median" - } - }, { "Name": "NormalizeFeatures", "Type": { @@ -3867,242 +3799,253 @@ "Default": "Auto" }, { - "Name": "TrainParallel", - "Type": "Bool", - "Desc": "All the base learners will run asynchronously if the value is true", + "Name": 
"LossFunction", + "Type": { + "Kind": "Component", + "ComponentKind": "ClassificationLossFunction" + }, + "Desc": "Loss Function", "Aliases": [ - "tp" + "loss" ], "Required": false, - "SortOrder": 106.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": false + "Default": { + "Name": "HingeLoss" + } }, { - "Name": "BatchSize", - "Type": "Int", - "Desc": "Batch size", + "Name": "LearningRate", + "Type": "Float", + "Desc": "Learning rate", "Aliases": [ - "bs" + "lr" ], "Required": false, - "SortOrder": 107.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": -1 + "Default": 1.0, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.01, + 0.1, + 0.5, + 1.0 + ] + } }, { - "Name": "ShowMetrics", + "Name": "DecreaseLearningRate", "Type": "Bool", - "Desc": "True, if metrics for each model need to be evaluated and shown in comparison table. This is done by using validation set if available or the training set", + "Desc": "Decrease learning rate", "Aliases": [ - "sm" + "decreaselr" ], "Required": false, - "SortOrder": 108.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": false - } - ], - "Outputs": [ + "Default": false, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } + }, { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The trained model" - } - ], - "InputKind": [ - "ITrainerInputWithLabel", - "ITrainerInput" - ], - "OutputKind": [ - "IBinaryClassificationOutput", - "ITrainerOutput" - ] - }, - { - "Name": "Trainer.ClassificationEnsemble", - "Desc": "Train multiclass ensemble.", - "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", - "ShortName": null, - "Inputs": [ + "Name": "L2RegularizerWeight", + "Type": "Float", + "Desc": "L2 Regularization Weight", + "Aliases": [ + "reg" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.0, + "Max": 0.5 + } + }, { - "Name": "TrainingData", - "Type": 
"DataView", - "Desc": "The data to be used for training", + "Name": "NumIterations", + "Type": "Int", + "Desc": "Number of iterations", "Aliases": [ - "data" + "iter" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 1, + "SweepRange": { + "RangeType": "Long", + "Min": 1, + "Max": 100, + "StepSize": 10.0, + "IsLogScale": true + } }, { - "Name": "SamplingType", + "Name": "InitWtsDiameter", + "Type": "Float", + "Desc": "Init weights diameter", + "Aliases": [ + "initwts" + ], + "Required": false, + "SortOrder": 140.0, + "IsNullable": false, + "Default": 0.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.0, + "Max": 1.0, + "NumSteps": 5 + } + }, + { + "Name": "Calibrator", "Type": { "Kind": "Component", - "ComponentKind": "EnsembleSubsetSelector" + "ComponentKind": "CalibratorTrainer" }, - "Desc": "Sampling Type", - "Aliases": [ - "st" - ], + "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, "Default": { - "Name": "BootstrapSelector", - "Settings": { - "FeatureSelector": { - "Name": "AllFeatureSelector" - } - } + "Name": "PlattCalibrator" } }, { - "Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", - "Aliases": [ - "feat" - ], + "Name": "MaxCalibrationExamples", + "Type": "Int", + "Desc": "The maximum number of examples to use when training the calibrator", "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Features" + "Default": 1000000 }, { - "Name": "NumModels", + "Name": "ResetWeightsAfterXExamples", "Type": "Int", - "Desc": "Number of models per batch. 
If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise.", + "Desc": "Number of examples after which weights will be reset to the current average", "Aliases": [ - "nm" + "numreset" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": true, "Default": null }, { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Column to use for labels", + "Name": "DoLazyUpdates", + "Type": "Bool", + "Desc": "Instead of updating averaged weights on every example, only update when loss is nonzero", "Aliases": [ - "lab" + "lazy" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Label" + "Default": true }, { - "Name": "SubModelSelectorType", - "Type": { - "Kind": "Component", - "ComponentKind": "EnsembleSubModelSelector" - }, - "Desc": "Algorithm to prune the base learners for selective Ensemble", + "Name": "RecencyGain", + "Type": "Float", + "Desc": "Extra weight given to more recent updates", "Aliases": [ - "pt" + "rg" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": { - "Name": "AllSelectorMultiClass" - } + "Default": 0.0 }, { - "Name": "OutputCombiner", - "Type": { - "Kind": "Component", - "ComponentKind": "EnsembleOutputCombiner" - }, - "Desc": "Output combiner", + "Name": "RecencyGainMulti", + "Type": "Bool", + "Desc": "Whether Recency Gain is multiplicative (vs. 
additive)", "Aliases": [ - "oc" + "rgm" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": { - "Name": "MultiMedian" - } + "Default": false }, { - "Name": "NormalizeFeatures", - "Type": { - "Kind": "Enum", - "Values": [ - "No", - "Warn", - "Auto", - "Yes" - ] - }, - "Desc": "Normalize option for the feature column", + "Name": "Averaged", + "Type": "Bool", + "Desc": "Do averaging?", "Aliases": [ - "norm" + "avg" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": true }, { - "Name": "Caching", - "Type": { - "Kind": "Enum", - "Values": [ - "Auto", - "Memory", - "Disk", - "None" - ] - }, - "Desc": "Whether learner should cache input training data", + "Name": "AveragedTolerance", + "Type": "Float", + "Desc": "The inexactness tolerance for averaging", "Aliases": [ - "cache" + "avgtol" ], "Required": false, - "SortOrder": 6.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": 0.01 }, { - "Name": "TrainParallel", - "Type": "Bool", - "Desc": "All the base learners will run asynchronously if the value is true", + "Name": "InitialWeights", + "Type": "String", + "Desc": "Initial Weights and bias, comma-separated", "Aliases": [ - "tp" + "initweights" ], "Required": false, - "SortOrder": 106.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": null }, { - "Name": "BatchSize", - "Type": "Int", - "Desc": "Batch size", + "Name": "Shuffle", + "Type": "Bool", + "Desc": "Whether to shuffle for each training iteration", "Aliases": [ - "bs" + "shuf" ], "Required": false, - "SortOrder": 107.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": -1 + "Default": true, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } }, { - "Name": "ShowMetrics", - "Type": "Bool", - "Desc": "True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set", + "Name": "StreamingCacheSize", + "Type": "Int", + "Desc": "Size of cache when trained in Scope", "Aliases": [ - "sm" + "cache" ], "Required": false, - "SortOrder": 108.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 1000000 } ], "Outputs": [ @@ -4117,13 +4060,13 @@ "ITrainerInput" ], "OutputKind": [ - "IMulticlassClassificationOutput", + "IBinaryClassificationOutput", "ITrainerOutput" ] }, { - "Name": "Trainer.RegressionEnsemble", - "Desc": "Train regression ensemble.", + "Name": "Trainers.BinaryEnsemble", + "Desc": "Train binary ensemble.", "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", "ShortName": null, "Inputs": [ @@ -4200,7 +4143,7 @@ "Name": "SubModelSelectorType", "Type": { "Kind": "Component", - "ComponentKind": "EnsembleSubModelSelector" + "ComponentKind": "EnsembleBinarySubModelSelector" }, "Desc": "Algorithm to prune the base learners for selective Ensemble", "Aliases": [ @@ -4217,7 +4160,7 @@ "Name": "OutputCombiner", "Type": { "Kind": "Component", - "ComponentKind": "EnsembleOutputCombiner" + "ComponentKind": "EnsembleBinaryOutputCombiner" }, "Desc": "Output combiner", "Aliases": [ @@ -4319,15 +4262,15 @@ "ITrainerInput" ], "OutputKind": [ - "IRegressionOutput", + "IBinaryClassificationOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.AveragedPerceptronBinaryClassifier", - "Desc": "Train a Average perceptron.", - "FriendlyName": "Averaged Perceptron", - "ShortName": "ap", + "Name": "Trainers.ClassificationEnsemble", + "Desc": "Train multiclass ensemble.", + "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", + "ShortName": null, "Inputs": [ { "Name": "TrainingData", @@ -4340,6 +4283,28 @@ "SortOrder": 1.0, "IsNullable": false }, + { + "Name": "SamplingType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleSubsetSelector" + }, + "Desc": "Sampling Type", + "Aliases": [ + "st" + ], + "Required": false, + 
"SortOrder": 2.0, + "IsNullable": false, + "Default": { + "Name": "BootstrapSelector", + "Settings": { + "FeatureSelector": { + "Name": "AllFeatureSelector" + } + } + } + }, { "Name": "FeatureColumn", "Type": "String", @@ -4352,6 +4317,18 @@ "IsNullable": false, "Default": "Features" }, + { + "Name": "NumModels", + "Type": "Int", + "Desc": "Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise.", + "Aliases": [ + "nm" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": true, + "Default": null + }, { "Name": "LabelColumn", "Type": "String", @@ -4364,6 +4341,40 @@ "IsNullable": false, "Default": "Label" }, + { + "Name": "SubModelSelectorType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleMulticlassSubModelSelector" + }, + "Desc": "Algorithm to prune the base learners for selective Ensemble", + "Aliases": [ + "pt" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": { + "Name": "AllSelectorMultiClass" + } + }, + { + "Name": "OutputCombiner", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleMulticlassOutputCombiner" + }, + "Desc": "Output combiner", + "Aliases": [ + "oc" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": { + "Name": "MultiMedian" + } + }, { "Name": "NormalizeFeatures", "Type": { @@ -4405,253 +4416,40 @@ "Default": "Auto" }, { - "Name": "LossFunction", - "Type": { - "Kind": "Component", - "ComponentKind": "ClassificationLossFunction" - }, - "Desc": "Loss Function", - "Aliases": [ - "loss" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": { - "Name": "HingeLoss" - } - }, - { - "Name": "LearningRate", - "Type": "Float", - "Desc": "Learning rate", - "Aliases": [ - "lr" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.01, - 0.1, - 0.5, 
- 1.0 - ] - } - }, - { - "Name": "DecreaseLearningRate", - "Type": "Bool", - "Desc": "Decrease learning rate", - "Aliases": [ - "decreaselr" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": false, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } - }, - { - "Name": "L2RegularizerWeight", - "Type": "Float", - "Desc": "L2 Regularization Weight", - "Aliases": [ - "reg" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 0.5 - } - }, - { - "Name": "NumIterations", - "Type": "Int", - "Desc": "Number of iterations", - "Aliases": [ - "iter" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 1, - "SweepRange": { - "RangeType": "Long", - "Min": 1, - "Max": 100, - "StepSize": 10.0, - "IsLogScale": true - } - }, - { - "Name": "InitWtsDiameter", - "Type": "Float", - "Desc": "Init weights diameter", - "Aliases": [ - "initwts" - ], - "Required": false, - "SortOrder": 140.0, - "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 5 - } - }, - { - "Name": "Calibrator", - "Type": { - "Kind": "Component", - "ComponentKind": "CalibratorTrainer" - }, - "Desc": "The calibrator kind to apply to the predictor. 
Specify null for no calibration", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": { - "Name": "PlattCalibrator" - } - }, - { - "Name": "MaxCalibrationExamples", - "Type": "Int", - "Desc": "The maximum number of examples to use when training the calibrator", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1000000 - }, - { - "Name": "ResetWeightsAfterXExamples", - "Type": "Int", - "Desc": "Number of examples after which weights will be reset to the current average", - "Aliases": [ - "numreset" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "DoLazyUpdates", - "Type": "Bool", - "Desc": "Instead of updating averaged weights on every example, only update when loss is nonzero", - "Aliases": [ - "lazy" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "RecencyGain", - "Type": "Float", - "Desc": "Extra weight given to more recent updates", - "Aliases": [ - "rg" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "RecencyGainMulti", - "Type": "Bool", - "Desc": "Whether Recency Gain is multiplicative (vs. 
additive)", - "Aliases": [ - "rgm" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "Averaged", + "Name": "TrainParallel", "Type": "Bool", - "Desc": "Do averaging?", - "Aliases": [ - "avg" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "AveragedTolerance", - "Type": "Float", - "Desc": "The inexactness tolerance for averaging", - "Aliases": [ - "avgtol" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.01 - }, - { - "Name": "InitialWeights", - "Type": "String", - "Desc": "Initial Weights and bias, comma-separated", + "Desc": "All the base learners will run asynchronously if the value is true", "Aliases": [ - "initweights" + "tp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 106.0, "IsNullable": false, - "Default": null + "Default": false }, { - "Name": "Shuffle", - "Type": "Bool", - "Desc": "Whether to shuffle for each training iteration", + "Name": "BatchSize", + "Type": "Int", + "Desc": "Batch size", "Aliases": [ - "shuf" + "bs" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 107.0, "IsNullable": false, - "Default": true, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } + "Default": -1 }, { - "Name": "StreamingCacheSize", - "Type": "Int", - "Desc": "Size of cache when trained in Scope", + "Name": "ShowMetrics", + "Type": "Bool", + "Desc": "True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set", "Aliases": [ - "cache" + "sm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 108.0, "IsNullable": false, - "Default": 1000000 + "Default": false } ], "Outputs": [ @@ -4666,7 +4464,7 @@ "ITrainerInput" ], "OutputKind": [ - "IBinaryClassificationOutput", + "IMulticlassClassificationOutput", "ITrainerOutput" ] }, @@ -12134,67 +11932,270 @@ "Type": "Float", "Desc": "Run SGD to initialize LR weights, converging to this tolerance", "Aliases": [ - "sgd" + "sgd" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 + }, + { + "Name": "Quiet", + "Type": "Bool", + "Desc": "If set to true, produce no output during training.", + "Aliases": [ + "q" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "UseThreads", + "Type": "Bool", + "Desc": "Whether or not to use threads. Default is true", + "Aliases": [ + "t" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "NumThreads", + "Type": "Int", + "Desc": "Number of threads", + "Aliases": [ + "nt" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "DenseOptimizer", + "Type": "Bool", + "Desc": "Force densification of the internal optimization vectors", + "Aliases": [ + "do" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithWeight", + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IRegressionOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.RegressionEnsemble", + "Desc": "Train regression ensemble.", + "FriendlyName": "Parallel Ensemble 
(bagging, stacking, etc)", + "ShortName": null, + "Inputs": [ + { + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "SamplingType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleSubsetSelector" + }, + "Desc": "Sampling Type", + "Aliases": [ + "st" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": { + "Name": "BootstrapSelector", + "Settings": { + "FeatureSelector": { + "Name": "AllFeatureSelector" + } + } + } + }, + { + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", + "Aliases": [ + "feat" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Features" + }, + { + "Name": "NumModels", + "Type": "Int", + "Desc": "Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise.", + "Aliases": [ + "nm" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", + "Aliases": [ + "lab" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Label" + }, + { + "Name": "SubModelSelectorType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleRegressionSubModelSelector" + }, + "Desc": "Algorithm to prune the base learners for selective Ensemble", + "Aliases": [ + "pt" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": { + "Name": "AllSelector" + } + }, + { + "Name": "OutputCombiner", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleRegressionOutputCombiner" + }, + "Desc": "Output combiner", + "Aliases": [ + "oc" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": { + "Name": "Median" + } + }, + { + 
"Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": "Auto" + }, + { + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", + "Aliases": [ + "cache" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": 0.0 + "Default": "Auto" }, { - "Name": "Quiet", + "Name": "TrainParallel", "Type": "Bool", - "Desc": "If set to true, produce no output during training.", + "Desc": "All the base learners will run asynchronously if the value is true", "Aliases": [ - "q" + "tp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 106.0, "IsNullable": false, "Default": false }, { - "Name": "UseThreads", - "Type": "Bool", - "Desc": "Whether or not to use threads. Default is true", - "Aliases": [ - "t" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "NumThreads", + "Name": "BatchSize", "Type": "Int", - "Desc": "Number of threads", + "Desc": "Batch size", "Aliases": [ - "nt" + "bs" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "SortOrder": 107.0, + "IsNullable": false, + "Default": -1 }, { - "Name": "DenseOptimizer", + "Name": "ShowMetrics", "Type": "Bool", - "Desc": "Force densification of the internal optimization vectors", + "Desc": "True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set", "Aliases": [ - "do" + "sm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 108.0, "IsNullable": false, - "Default": false, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } + "Default": false } ], "Outputs": [ @@ -12205,7 +12206,6 @@ } ], "InputKind": [ - "ITrainerInputWithWeight", "ITrainerInputWithLabel", "ITrainerInput" ], @@ -19380,73 +19380,246 @@ ] }, { - "Kind": "EnsembleDiversityMeasure", + "Kind": "EnsembleBinaryOutputCombiner", "Components": [ { - "Name": "DisagreementDiversityMeasure", + "Name": "Average", "Desc": null, - "FriendlyName": "Disagreement Diversity Measure", + "FriendlyName": "Average", "Settings": [] }, { - "Name": "MultiDisagreementDiversityMeasure", + "Name": "Median", "Desc": null, - "FriendlyName": "Disagreement Diversity Measure", + "FriendlyName": "Median", "Settings": [] }, { - "Name": "RegressionDisagreementDiversityMeasure", + "Name": "Stacking", "Desc": null, - "FriendlyName": "Disagreement Diversity Measure", + "FriendlyName": "Stacking", + "Settings": [ + { + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", + "Aliases": [ + "vp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.3 + } + ] + }, + { + "Name": "Voting", + "Desc": null, + "FriendlyName": "Voting", "Settings": [] + }, + { + "Name": "WeightedAverage", + "Desc": null, + "FriendlyName": "Stacking", + "Settings": [ + { + "Name": "WeightageName", + "Type": { + "Kind": "Enum", + "Values": [ + "Accuracy", + "Auc", + "PosPrecision", + "PosRecall", + "NegPrecision", + "NegRecall" + ] + }, + "Desc": "The metric type to be used to find the weights for each model", + "Aliases": [ + "wn" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": "Auc" + } + ] } ] }, { - "Kind": "EnsembleFeatureSelector", + "Kind": "EnsembleBinarySubModelSelector", "Components": [ { - "Name": "AllFeatureSelector", + "Name": "AllSelector", "Desc": null, - "FriendlyName": "All Feature Selector", + "FriendlyName": "All Selector", "Settings": [] }, { - "Name": "RandomFeatureSelector", + "Name": "BestDiverseSelector", "Desc": null, - "FriendlyName": "Random Feature Selector", + "FriendlyName": "Best Diverse Selector", "Settings": [ { - "Name": "FeaturesSelectionProportion", + "Name": "DiversityMetricType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleDiversityMeasure" + }, + "Desc": "The metric type to be used to find the diversity among base learners", + "Aliases": [ + "dm" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "LearnersSelectionProportion", "Type": "Float", - "Desc": "The proportion of features to be selected. The range is 0.0-1.0", + "Desc": "The proportion of best base learners to be selected. 
The range is 0.0-1.0", "Aliases": [ - "fp" + "lp" ], "Required": false, "SortOrder": 50.0, "IsNullable": false, - "Default": 0.8 + "Default": 0.5 + }, + { + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set", + "Aliases": [ + "vp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.3 + } + ] + }, + { + "Name": "BestPerformanceSelector", + "Desc": null, + "FriendlyName": "Best Performance Selector", + "Settings": [ + { + "Name": "MetricName", + "Type": { + "Kind": "Enum", + "Values": [ + "Accuracy", + "PosPrecName", + "PosRecallName", + "NegPrecName", + "NegRecallName", + "Auc", + "LogLoss", + "LogLossReduction", + "F1", + "AuPrc" + ] + }, + "Desc": "The metric type to be used to find the best performance", + "Aliases": [ + "mn" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": "Auc" + }, + { + "Name": "LearnersSelectionProportion", + "Type": "Float", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", + "Aliases": [ + "lp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.5 + }, + { + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", + "Aliases": [ + "vp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.3 } ] } ] }, { - "Kind": "EnsembleOutputCombiner", + "Kind": "EnsembleDiversityMeasure", "Components": [ { - "Name": "Average", + "Name": "DisagreementDiversityMeasure", "Desc": null, - "FriendlyName": "Average", + "FriendlyName": "Disagreement Diversity Measure", "Settings": [] }, { - "Name": "Median", + "Name": "MultiDisagreementDiversityMeasure", "Desc": null, - "FriendlyName": "Median", + "FriendlyName": "Disagreement Diversity Measure", + "Settings": [] + }, + { + "Name": "RegressionDisagreementDiversityMeasure", + "Desc": null, + "FriendlyName": "Disagreement Diversity Measure", + "Settings": [] + } + ] + }, + { + "Kind": "EnsembleFeatureSelector", + "Components": [ + { + "Name": "AllFeatureSelector", + "Desc": null, + "FriendlyName": "All Feature Selector", "Settings": [] }, + { + "Name": "RandomFeatureSelector", + "Desc": null, + "FriendlyName": "Random Feature Selector", + "Settings": [ + { + "Name": "FeaturesSelectionProportion", + "Type": "Float", + "Desc": "The proportion of features to be selected. The range is 0.0-1.0", + "Aliases": [ + "fp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.8 + } + ] + } + ] + }, + { + "Kind": "EnsembleMulticlassOutputCombiner", + "Components": [ { "Name": "MultiAverage", "Desc": null, @@ -19559,91 +19732,12 @@ "Default": true } ] - }, - { - "Name": "RegressionStacking", - "Desc": null, - "FriendlyName": "Stacking", - "Settings": [ - { - "Name": "ValidationDatasetProportion", - "Type": "Float", - "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", - "Aliases": [ - "vp" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 0.3 - } - ] - }, - { - "Name": "Stacking", - "Desc": null, - "FriendlyName": "Stacking", - "Settings": [ - { - "Name": "ValidationDatasetProportion", - "Type": "Float", - "Desc": "The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set", - "Aliases": [ - "vp" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 0.3 - } - ] - }, - { - "Name": "Voting", - "Desc": null, - "FriendlyName": "Voting", - "Settings": [] - }, - { - "Name": "WeightedAverage", - "Desc": null, - "FriendlyName": "Stacking", - "Settings": [ - { - "Name": "WeightageName", - "Type": { - "Kind": "Enum", - "Values": [ - "Accuracy", - "Auc", - "PosPrecision", - "PosRecall", - "NegPrecision", - "NegRecall" - ] - }, - "Desc": "The metric type to be used to find the weights for each model", - "Aliases": [ - "wn" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": "Auc" - } - ] } ] }, { - "Kind": "EnsembleSubModelSelector", + "Kind": "EnsembleMulticlassSubModelSelector", "Components": [ - { - "Name": "AllSelector", - "Desc": null, - "FriendlyName": "All Selector", - "Settings": [] - }, { "Name": "AllSelectorMultiClass", "Desc": null, @@ -19651,7 +19745,7 @@ "Settings": [] }, { - "Name": "BestDiverseSelector", + "Name": "BestDiverseSelectorMultiClass", "Desc": null, "FriendlyName": "Best Diverse Selector", "Settings": [ @@ -19697,24 +19791,29 @@ ] }, { - "Name": "BestDiverseSelectorMultiClass", + "Name": "BestPerformanceSelectorMultiClass", "Desc": null, - "FriendlyName": "Best Diverse Selector", + "FriendlyName": "Best Performance Selector", "Settings": [ { - "Name": "DiversityMetricType", + "Name": "MetricName", "Type": { - "Kind": "Component", - "ComponentKind": "EnsembleDiversityMeasure" + "Kind": "Enum", + "Values": [ + "AccuracyMicro", + 
"AccuracyMacro", + "LogLoss", + "LogLossReduction" + ] }, - "Desc": "The metric type to be used to find the diversity among base learners", + "Desc": "The metric type to be used to find the best performance", "Aliases": [ - "dm" + "mn" ], "Required": false, "SortOrder": 50.0, "IsNullable": false, - "Default": null + "Default": "AccuracyMicro" }, { "Name": "LearnersSelectionProportion", @@ -19741,91 +19840,29 @@ "Default": 0.3 } ] + } + ] + }, + { + "Kind": "EnsembleRegressionOutputCombiner", + "Components": [ + { + "Name": "Average", + "Desc": null, + "FriendlyName": "Average", + "Settings": [] }, { - "Name": "BestDiverseSelectorRegression", + "Name": "Median", "Desc": null, - "FriendlyName": "Best Diverse Selector", - "Settings": [ - { - "Name": "DiversityMetricType", - "Type": { - "Kind": "Component", - "ComponentKind": "EnsembleDiversityMeasure" - }, - "Desc": "The metric type to be used to find the diversity among base learners", - "Aliases": [ - "dm" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "LearnersSelectionProportion", - "Type": "Float", - "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", - "Aliases": [ - "lp" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 0.5 - }, - { - "Name": "ValidationDatasetProportion", - "Type": "Float", - "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", - "Aliases": [ - "vp" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 0.3 - } - ] + "FriendlyName": "Median", + "Settings": [] }, { - "Name": "BestPerformanceRegressionSelector", + "Name": "RegressionStacking", "Desc": null, - "FriendlyName": "Best Performance Selector", + "FriendlyName": "Stacking", "Settings": [ - { - "Name": "MetricName", - "Type": { - "Kind": "Enum", - "Values": [ - "L1", - "L2", - "Rms", - "Loss", - "RSquared" - ] - }, - "Desc": "The metric type to be used to find the best performance", - "Aliases": [ - "mn" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": "L1" - }, - { - "Name": "LearnersSelectionProportion", - "Type": "Float", - "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", - "Aliases": [ - "lp" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 0.5 - }, { "Name": "ValidationDatasetProportion", "Type": "Float", @@ -19839,37 +19876,37 @@ "Default": 0.3 } ] + } + ] + }, + { + "Kind": "EnsembleRegressionSubModelSelector", + "Components": [ + { + "Name": "AllSelector", + "Desc": null, + "FriendlyName": "All Selector", + "Settings": [] }, { - "Name": "BestPerformanceSelector", + "Name": "BestDiverseSelectorRegression", "Desc": null, - "FriendlyName": "Best Performance Selector", + "FriendlyName": "Best Diverse Selector", "Settings": [ { - "Name": "MetricName", + "Name": "DiversityMetricType", "Type": { - "Kind": "Enum", - "Values": [ - "Accuracy", - "PosPrecName", - "PosRecallName", - "NegPrecName", - "NegRecallName", - "Auc", - "LogLoss", - "LogLossReduction", - "F1", - "AuPrc" - ] + "Kind": "Component", + "ComponentKind": "EnsembleDiversityMeasure" }, - "Desc": "The metric type to be used to find the best performance", + "Desc": "The metric type to be used to find the diversity among base learners", "Aliases": [ - "mn" + "dm" ], "Required": false, "SortOrder": 50.0, 
"IsNullable": false, - "Default": "Auc" + "Default": null }, { "Name": "LearnersSelectionProportion", @@ -19898,7 +19935,7 @@ ] }, { - "Name": "BestPerformanceSelectorMultiClass", + "Name": "BestPerformanceRegressionSelector", "Desc": null, "FriendlyName": "Best Performance Selector", "Settings": [ @@ -19907,10 +19944,11 @@ "Type": { "Kind": "Enum", "Values": [ - "AccuracyMicro", - "AccuracyMacro", - "LogLoss", - "LogLossReduction" + "L1", + "L2", + "Rms", + "Loss", + "RSquared" ] }, "Desc": "The metric type to be used to find the best performance", @@ -19920,7 +19958,7 @@ "Required": false, "SortOrder": 50.0, "IsNullable": false, - "Default": "AccuracyMicro" + "Default": "L1" }, { "Name": "LearnersSelectionProportion", diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index a1d51b574f..3134fe43aa 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -1778,6 +1778,24 @@ public void EntryPointLinearSVM() TestEntryPointRoutine("iris.txt", "Trainers.LinearSvmBinaryClassifier"); } + [Fact] + public void EntryPointBinaryEnsemble() + { + TestEntryPointRoutine("iris.txt", "Trainers.BinaryEnsemble"); + } + + [Fact] + public void EntryPointClassificationEnsemble() + { + TestEntryPointRoutine("iris.txt", "Trainers.ClassificationEnsemble"); + } + + [Fact] + public void EntryPointRegressionEnsemble() + { + TestEntryPointRoutine(TestDatasets.winequality.trainFilename, "Trainers.RegressionEnsemble", loader: TestDatasets.winequality.loaderSettings); + } + [Fact] public void EntryPointNaiveBayesMultiClass() { @@ -1790,7 +1808,7 @@ public void EntryPointHogwildSGD() TestEntryPointRoutine("breast-cancer.txt", "Trainers.StochasticGradientDescentBinaryClassifier"); } - [Fact()] + [Fact] public void EntryPointPoissonRegression() { TestEntryPointRoutine(TestDatasets.winequality.trainFilename, "Trainers.PoissonRegressor", 
loader: TestDatasets.winequality.loaderSettings); diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs index a3cae62e9a..161fd28881 100644 --- a/test/Microsoft.ML.TestFramework/Datasets.cs +++ b/test/Microsoft.ML.TestFramework/Datasets.cs @@ -150,7 +150,7 @@ public static class TestDatasets name = "wine", trainFilename = "external/winequality-white.csv", testFilename = "external/winequality-white.csv", - loaderSettings = "loader=Text{col=Label:R4:11 col=Features:R4:0-10 sep=semicolon header+}" + loaderSettings = "col=Label:R4:11 col=Features:R4:0-10 sep=semicolon header+" }; public static TestDataset msm = new TestDataset From 60208630b94679a39194e5296062fbf583b457ad Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Tue, 19 Jun 2018 16:18:02 -0700 Subject: [PATCH 07/20] update solution and core_manifest --- Microsoft.ML.sln | 7 + .../Common/EntryPoints/core_manifest.json | 7532 ++++++++++------- 2 files changed, 4604 insertions(+), 2935 deletions(-) diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln index 57b077cec5..cc33128892 100644 --- a/Microsoft.ML.sln +++ b/Microsoft.ML.sln @@ -116,6 +116,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "netstandard2.0", "netstanda EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Sweeper.Tests", "test\Microsoft.ML.Sweeper.Tests\Microsoft.ML.Sweeper.Tests.csproj", "{3DEB504D-7A07-48CE-91A2-8047461CB3D4}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.Ensemble", "src\Microsoft.ML.Ensemble\Microsoft.ML.Ensemble.csproj", "{122EE40E-63BE-43A4-81BF-D0CF76B22B47}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -222,6 +224,10 @@ Global {3DEB504D-7A07-48CE-91A2-8047461CB3D4}.Debug|Any CPU.Build.0 = Debug|Any CPU {3DEB504D-7A07-48CE-91A2-8047461CB3D4}.Release|Any CPU.ActiveCfg = Release|Any CPU {3DEB504D-7A07-48CE-91A2-8047461CB3D4}.Release|Any CPU.Build.0 
= Release|Any CPU + {122EE40E-63BE-43A4-81BF-D0CF76B22B47}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {122EE40E-63BE-43A4-81BF-D0CF76B22B47}.Debug|Any CPU.Build.0 = Debug|Any CPU + {122EE40E-63BE-43A4-81BF-D0CF76B22B47}.Release|Any CPU.ActiveCfg = Release|Any CPU + {122EE40E-63BE-43A4-81BF-D0CF76B22B47}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -260,6 +266,7 @@ Global {487213C9-E8A9-4F94-85D7-28A05DBBFE3A} = {DEC8F776-49F7-4D87-836C-FE4DC057D08C} {9252A8EB-ABFB-440C-AB4D-1D562753CE0F} = {487213C9-E8A9-4F94-85D7-28A05DBBFE3A} {3DEB504D-7A07-48CE-91A2-8047461CB3D4} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} + {122EE40E-63BE-43A4-81BF-D0CF76B22B47} = {09EADF06-BE25-4228-AB53-95AE3E15B530} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D} diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index a5cb656da9..eb9054b783 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -654,6 +654,57 @@ "IEvaluatorOutput" ] }, + { + "Name": "Models.AnomalyPipelineEnsemble", + "Desc": "Combine anomaly detection models into an ensemble", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Average" + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + 
"Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IAnomalyDetectionOutput", + "ITrainerOutput" + ] + }, { "Name": "Models.BinaryClassificationEvaluator", "Desc": "Evaluates a binary classification scored dataset.", @@ -967,6 +1018,122 @@ } ] }, + { + "Name": "Models.BinaryEnsemble", + "Desc": "Combine binary classifiers into an ensemble", + "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average", + "Vote" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Median" + }, + { + "Name": "ValidatePipelines", + "Type": "Bool", + "Desc": "Whether to validate that all the pipelines are identical", + "Aliases": [ + "validate" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": true + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IBinaryClassificationOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Models.BinaryPipelineEnsemble", + "Desc": "Combine binary classification models into an ensemble", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average", + "Vote" + ] + }, + "Desc": "The 
combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Median" + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IBinaryClassificationOutput", + "ITrainerOutput" + ] + }, { "Name": "Models.ClassificationEvaluator", "Desc": "Evaluates a multi class classification scored dataset.", @@ -1694,6 +1861,43 @@ "ITransformInput" ] }, + { + "Name": "Models.EnsembleSummary", + "Desc": "Summarize a pipeline ensemble predictor.", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The predictor to summarize", + "Aliases": [ + "predictorModel" + ], + "Required": true, + "SortOrder": 150.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "Summaries", + "Type": { + "Kind": "Array", + "ItemType": "DataView" + }, + "Desc": "The summaries of the individual predictors" + }, + { + "Name": "Stats", + "Type": { + "Kind": "Array", + "ItemType": "DataView" + }, + "Desc": "The model statistics of the individual predictors" + } + ] + }, { "Name": "Models.FixedPlattCalibrator", "Desc": "Apply a Platt calibrator with a fixed slope and offset to an input model", @@ -1776,6 +1980,58 @@ "ITrainerOutput" ] }, + { + "Name": "Models.MultiClassPipelineEnsemble", + "Desc": "Combine multiclass classifiers into an ensemble", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average", + "Vote" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + 
"Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Median" + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IMulticlassClassificationOutput", + "ITrainerOutput" + ] + }, { "Name": "Models.MultiOutputRegressionEvaluator", "Desc": "Evaluates a multi output regression scored dataset.", @@ -2822,6 +3078,69 @@ "IEvaluatorOutput" ] }, + { + "Name": "Models.RegressionEnsemble", + "Desc": "Combine regression models into an ensemble", + "FriendlyName": "Regression Ensemble (bagging, stacking, etc)", + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Median" + }, + { + "Name": "ValidatePipelines", + "Type": "Bool", + "Desc": "Whether to validate that all the pipelines are identical", + "Aliases": [ + "validate" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": true + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IRegressionOutput", + "ITrainerOutput" + ] + }, { "Name": "Models.RegressionEvaluator", "Desc": "Evaluates a regression scored dataset.", @@ -2945,13 +3264,64 @@ ] }, { - "Name": "Models.Summarizer", - "Desc": "Summarize a linear regression predictor.", + "Name": "Models.RegressionPipelineEnsemble", + "Desc": "Combine regression models into an ensemble", "FriendlyName": null, "ShortName": null, "Inputs": [ { - "Name": "PredictorModel", 
+ "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Median" + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IRegressionOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Models.Summarizer", + "Desc": "Summarize a linear regression predictor.", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "PredictorModel", "Type": "PredictorModel", "Desc": "The predictor to summarize", "Aliases": [ @@ -3695,31 +4065,11 @@ ] }, { - "Name": "Trainers.FastForestBinaryClassifier", - "Desc": "Uses a random forest learner to perform binary classification.", - "FriendlyName": "Fast Forest Classification", - "ShortName": "ff", + "Name": "Trainers.BinaryEnsemble", + "Desc": "Train binary ensemble.", + "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", + "ShortName": null, "Inputs": [ - { - "Name": "NumTrees", - "Type": "Int", - "Desc": "Total number of decision trees to create in the ensemble", - "Aliases": [ - "iter" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": 100, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 20, - 100, - 500 - ] - } - }, { "Name": "TrainingData", "Type": "DataView", @@ -3732,22 +4082,25 @@ "IsNullable": false }, { - "Name": "NumLeaves", - "Type": "Int", - "Desc": "The max number of leaves in each regression tree", + "Name": "SamplingType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleSubsetSelector" + }, + "Desc": 
"Sampling Type", "Aliases": [ - "nl" + "st" ], "Required": false, "SortOrder": 2.0, "IsNullable": false, - "Default": 20, - "SweepRange": { - "RangeType": "Long", - "Min": 2, - "Max": 128, - "StepSize": 4.0, - "IsLogScale": true + "Default": { + "Name": "BootstrapSelector", + "Settings": { + "FeatureSelector": { + "Name": "AllFeatureSelector" + } + } } }, { @@ -3763,24 +4116,16 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "NumModels", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise.", "Aliases": [ - "mil" + "nm" ], "Required": false, "SortOrder": 3.0, - "IsNullable": false, - "Default": 10, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 1, - 10, - 50 - ] - } + "IsNullable": true, + "Default": null }, { "Name": "LabelColumn", @@ -3795,28 +4140,38 @@ "Default": "Label" }, { - "Name": "WeightColumn", - "Type": "String", - "Desc": "Column to use for example weight", + "Name": "SubModelSelectorType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleBinarySubModelSelector" + }, + "Desc": "Algorithm to prune the base learners for selective Ensemble", "Aliases": [ - "weight" + "pt" ], "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": { + "Name": "AllSelector" + } }, { - "Name": "GroupIdColumn", - "Type": "String", - "Desc": "Column to use for example groupId", + "Name": "OutputCombiner", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleBinaryOutputCombiner" + }, + "Desc": "Output combiner", "Aliases": [ - "groupId" + "oc" ], "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": { + "Name": "Median" + } }, { "Name": "NormalizeFeatures", @@ -3859,419 +4214,578 @@ "Default": "Auto" }, { - 
"Name": "MaxTreeOutput", - "Type": "Float", - "Desc": "Upper bound on absolute value of single tree output", + "Name": "TrainParallel", + "Type": "Bool", + "Desc": "All the base learners will run asynchronously if the value is true", "Aliases": [ - "mo" + "tp" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 100.0 - }, - { - "Name": "Calibrator", - "Type": { - "Kind": "Component", - "ComponentKind": "CalibratorTrainer" - }, - "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", - "Required": false, - "SortOrder": 150.0, + "SortOrder": 106.0, "IsNullable": false, - "Default": { - "Name": "PlattCalibrator" - } + "Default": false }, { - "Name": "MaxCalibrationExamples", + "Name": "BatchSize", "Type": "Int", - "Desc": "The maximum number of examples to use when training the calibrator", + "Desc": "Batch size", + "Aliases": [ + "bs" + ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 107.0, "IsNullable": false, - "Default": 1000000 + "Default": -1 }, { - "Name": "QuantileSampleCount", - "Type": "Int", - "Desc": "Number of labels to be sampled from each leaf to make the distribtuion", + "Name": "ShowMetrics", + "Type": "Bool", + "Desc": "True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set", "Aliases": [ - "qsc" + "sm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 108.0, "IsNullable": false, - "Default": 100 + "Default": false + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IBinaryClassificationOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.ClassificationEnsemble", + "Desc": "Train multiclass ensemble.", + "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", + "ShortName": null, + "Inputs": [ + { + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "ParallelTrainer", + "Name": "SamplingType", "Type": { "Kind": "Component", - "ComponentKind": "ParallelTraining" + "ComponentKind": "EnsembleSubsetSelector" }, - "Desc": "Allows to choose Parallel FastTree Learning Algorithm", + "Desc": "Sampling Type", "Aliases": [ - "parag" + "st" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, "Default": { - "Name": "Single" + "Name": "BootstrapSelector", + "Settings": { + "FeatureSelector": { + "Name": "AllFeatureSelector" + } + } } }, { - "Name": "NumThreads", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", + "Aliases": [ + "feat" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Features" + }, + { + "Name": "NumModels", "Type": "Int", - "Desc": "The number of threads to use", + "Desc": "Number of models per batch. 
If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise.", "Aliases": [ - "t" + "nm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": true, "Default": null }, { - "Name": "RngSeed", - "Type": "Int", - "Desc": "The seed of the random number generator", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", "Aliases": [ - "r1" + "lab" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 123 + "Default": "Label" }, { - "Name": "FeatureSelectSeed", - "Type": "Int", - "Desc": "The seed of the active feature selection", + "Name": "SubModelSelectorType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleMulticlassSubModelSelector" + }, + "Desc": "Algorithm to prune the base learners for selective Ensemble", "Aliases": [ - "r3" + "pt" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": 123 + "Default": { + "Name": "AllSelectorMultiClass" + } }, { - "Name": "EntropyCoefficient", - "Type": "Float", - "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Name": "OutputCombiner", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleMulticlassOutputCombiner" + }, + "Desc": "Output combiner", "Aliases": [ - "e" + "oc" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": 0.0 + "Default": { + "Name": "MultiMedian" + } }, { - "Name": "HistogramPoolSize", - "Type": "Int", - "Desc": "The number of histograms in the pool (between 2 and numLeaves)", + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", "Aliases": [ - "ps" + "norm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": -1 + "Default": "Auto" }, { - "Name": "DiskTranspose", 
- "Type": "Bool", - "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", "Aliases": [ - "dt" + "cache" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "SortOrder": 6.0, + "IsNullable": false, + "Default": "Auto" }, { - "Name": "FeatureFlocks", + "Name": "TrainParallel", "Type": "Bool", - "Desc": "Whether to collectivize features during dataset preparation to speed up training", + "Desc": "All the base learners will run asynchronously if the value is true", "Aliases": [ - "flocks" + "tp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 106.0, "IsNullable": false, - "Default": true + "Default": false }, { - "Name": "CategoricalSplit", + "Name": "BatchSize", + "Type": "Int", + "Desc": "Batch size", + "Aliases": [ + "bs" + ], + "Required": false, + "SortOrder": 107.0, + "IsNullable": false, + "Default": -1 + }, + { + "Name": "ShowMetrics", "Type": "Bool", - "Desc": "Whether to do split based on multiple categorical feature values.", + "Desc": "True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set", "Aliases": [ - "cat" + "sm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 108.0, "IsNullable": false, "Default": false - }, + } + ], + "Outputs": [ { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IMulticlassClassificationOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.FastForestBinaryClassifier", + "Desc": "Uses a random forest learner to perform binary classification.", + "FriendlyName": "Fast Forest Classification", + "ShortName": "ff", + "Inputs": [ + { + "Name": "NumTrees", "Type": "Int", - "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", + "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ - "mcg" + "iter" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 1.0, "IsNullable": false, - "Default": 64 + "Default": 100, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 20, + 100, + 500 + ] + } }, { - "Name": "MaxCategoricalSplitPoints", + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "NumLeaves", "Type": "Int", - "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", + "Desc": "The max number of leaves in each regression tree", "Aliases": [ - "maxcat" + "nl" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": 64 + "Default": 20, + "SweepRange": { + "RangeType": "Long", + "Min": 2, + "Max": 128, + "StepSize": 4.0, + "IsLogScale": true + } }, 
{ - "Name": "MinDocsPercentageForCategoricalSplit", - "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", "Aliases": [ - "mdop" + "feat" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": 0.001 + "Default": "Features" }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "MinDocumentsInLeafs", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ - "mdo" + "mil" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 100 + "Default": 10, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 10, + 50 + ] + } }, { - "Name": "Bias", - "Type": "Float", - "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", "Aliases": [ - "bias" + "lab" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 0.0 + "Default": "Label" }, { - "Name": "Bundling", - "Type": { - "Kind": "Enum", - "Values": [ - "None", - "AggregateLowPopulation", - "Adjacent" - ] - }, - "Desc": "Bundle low population bins. 
Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", + "Name": "WeightColumn", + "Type": "String", + "Desc": "Column to use for example weight", "Aliases": [ - "bundle" + "weight" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": "None" + "Default": "Weight" }, { - "Name": "MaxBins", - "Type": "Int", - "Desc": "Maximum number of distinct values (bins) per feature", + "Name": "GroupIdColumn", + "Type": "String", + "Desc": "Column to use for example groupId", "Aliases": [ - "mb" + "groupId" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": 255 + "Default": "GroupId" }, { - "Name": "SparsifyThreshold", - "Type": "Float", - "Desc": "Sparsity level needed to use sparse feature representation", + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", "Aliases": [ - "sp" + "norm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": 0.7 + "Default": "Auto" }, { - "Name": "FeatureFirstUsePenalty", - "Type": "Float", - "Desc": "The feature first use penalty coefficient", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", "Aliases": [ - "ffup" + "cache" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": 0.0 + "Default": "Auto" }, { - "Name": "FeatureReusePenalty", + "Name": "MaxTreeOutput", "Type": "Float", - "Desc": "The feature re-use penalty (regularization) coefficient", + "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ - "frup" + "mo" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 100.0 }, { 
- "Name": "GainConfidenceLevel", - "Type": "Float", - "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", - "Aliases": [ - "gainconf" - ], + "Name": "Calibrator", + "Type": { + "Kind": "Component", + "ComponentKind": "CalibratorTrainer" + }, + "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": { + "Name": "PlattCalibrator" + } }, { - "Name": "SoftmaxTemperature", - "Type": "Float", - "Desc": "The temperature of the randomized softmax distribution for choosing the feature", + "Name": "MaxCalibrationExamples", + "Type": "Int", + "Desc": "The maximum number of examples to use when training the calibrator", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000000 + }, + { + "Name": "QuantileSampleCount", + "Type": "Int", + "Desc": "Number of labels to be sampled from each leaf to make the distribtuion", "Aliases": [ - "smtemp" + "qsc" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 100 }, { - "Name": "ExecutionTimes", - "Type": "Bool", - "Desc": "Print execution time breakdown to stdout", + "Name": "ParallelTrainer", + "Type": { + "Kind": "Component", + "ComponentKind": "ParallelTraining" + }, + "Desc": "Allows to choose Parallel FastTree Learning Algorithm", "Aliases": [ - "et" + "parag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": { + "Name": "Single" + } }, { - "Name": "FeatureFraction", - "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each iteration", + "Name": "NumThreads", + "Type": "Int", + "Desc": "The number of threads to use", "Aliases": [ - "ff" + "t" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.7 + "IsNullable": true, + "Default": null }, { - "Name": "BaggingSize", + "Name": "RngSeed", "Type": "Int", - 
"Desc": "Number of trees in each bag (0 for disabling bagging)", + "Desc": "The seed of the random number generator", "Aliases": [ - "bag" + "r1" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": 123 }, { - "Name": "BaggingTrainFraction", - "Type": "Float", - "Desc": "Percentage of training examples used in each bag", + "Name": "FeatureSelectSeed", + "Type": "Int", + "Desc": "The seed of the active feature selection", "Aliases": [ - "bagfrac" + "r3" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.7 + "Default": 123 }, { - "Name": "SplitFraction", + "Name": "EntropyCoefficient", "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each split", + "Desc": "The entropy (regularization) coefficient between 0 and 1", "Aliases": [ - "sf" + "e" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.7 + "Default": 0.0 }, { - "Name": "Smoothing", - "Type": "Float", - "Desc": "Smoothing paramter for tree regularization", + "Name": "HistogramPoolSize", + "Type": "Int", + "Desc": "The number of histograms in the pool (between 2 and numLeaves)", "Aliases": [ - "s" + "ps" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": -1 }, { - "Name": "AllowEmptyTrees", + "Name": "DiskTranspose", "Type": "Bool", - "Desc": "When a root split is impossible, allow training to proceed", + "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", "Aliases": [ - "allowempty", - "dummies" + "dt" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": true + "IsNullable": true, + "Default": null }, { - "Name": "FeatureCompressionLevel", - "Type": "Int", - "Desc": "The level of feature compression to use", + "Name": "FeatureFlocks", + "Type": "Bool", + "Desc": "Whether to collectivize features during dataset preparation to speed up training", 
"Aliases": [ - "fcomp" + "flocks" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": true }, { - "Name": "CompressEnsemble", + "Name": "CategoricalSplit", "Type": "Bool", - "Desc": "Compress the tree Ensemble", + "Desc": "Whether to do split based on multiple categorical feature values.", "Aliases": [ - "cmp" + "cat" ], "Required": false, "SortOrder": 150.0, @@ -4279,364 +4793,259 @@ "Default": false }, { - "Name": "MaxTreesAfterCompression", + "Name": "MaxCategoricalGroupsPerNode", "Type": "Int", - "Desc": "Maximum Number of trees after compression", + "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", "Aliases": [ - "cmpmax" + "mcg" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": -1 + "Default": 64 }, { - "Name": "PrintTestGraph", - "Type": "Bool", - "Desc": "Print metrics graph for the first test set", + "Name": "MaxCategoricalSplitPoints", + "Type": "Int", + "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ - "graph" + "maxcat" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 64 }, { - "Name": "PrintTrainValidGraph", - "Type": "Bool", - "Desc": "Print Train and Validation metrics in graph", + "Name": "MinDocsPercentageForCategoricalSplit", + "Type": "Float", + "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", "Aliases": [ - "graphtv" + "mdop" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.001 }, { - "Name": "TestFrequency", + "Name": "MinDocsForCategoricalSplit", "Type": "Int", - "Desc": "Calculate metric values for train/valid/test every k rounds", + "Desc": "Minimum categorical doc count in a bin to consider for a split.", "Aliases": [ - "tf" + 
"mdo" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 2147483647 - } - ], - "Outputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The trained model" - } - ], - "InputKind": [ - "ITrainerInputWithGroupId", - "ITrainerInputWithWeight", - "ITrainerInputWithLabel", - "ITrainerInput" - ], - "OutputKind": [ - "IBinaryClassificationOutput", - "ITrainerOutput" - ] - }, - { - "Name": "Trainers.FastForestRegressor", - "Desc": "Trains a random forest to fit target values using least-squares.", - "FriendlyName": "FastForestRegression", - "ShortName": "ffr", - "Inputs": [ + "Default": 100 + }, { - "Name": "NumTrees", - "Type": "Int", - "Desc": "Total number of decision trees to create in the ensemble", + "Name": "Bias", + "Type": "Float", + "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", "Aliases": [ - "iter" + "bias" ], "Required": false, - "SortOrder": 1.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 100, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 20, - 100, - 500 - ] - } + "Default": 0.0 }, { - "Name": "TrainingData", - "Type": "DataView", - "Desc": "The data to be used for training", + "Name": "Bundling", + "Type": { + "Kind": "Enum", + "Values": [ + "None", + "AggregateLowPopulation", + "Adjacent" + ] + }, + "Desc": "Bundle low population bins. 
Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", "Aliases": [ - "data" + "bundle" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "None" }, { - "Name": "NumLeaves", + "Name": "MaxBins", "Type": "Int", - "Desc": "The max number of leaves in each regression tree", + "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ - "nl" + "mb" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 20, - "SweepRange": { - "RangeType": "Long", - "Min": 2, - "Max": 128, - "StepSize": 4.0, - "IsLogScale": true - } + "Default": 255 }, { - "Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", + "Name": "SparsifyThreshold", + "Type": "Float", + "Desc": "Sparsity level needed to use sparse feature representation", "Aliases": [ - "feat" + "sp" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Features" - }, - { - "Name": "MinDocumentsInLeafs", - "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", - "Aliases": [ - "mil" - ], - "Required": false, - "SortOrder": 3.0, - "IsNullable": false, - "Default": 10, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 1, - 10, - 50 - ] - } - }, - { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Column to use for labels", - "Aliases": [ - "lab" - ], - "Required": false, - "SortOrder": 3.0, - "IsNullable": false, - "Default": "Label" - }, - { - "Name": "WeightColumn", - "Type": "String", - "Desc": "Column to use for example weight", - "Aliases": [ - "weight" - ], - "Required": false, - "SortOrder": 4.0, - "IsNullable": false, - "Default": "Weight" + "Default": 0.7 }, { - "Name": "GroupIdColumn", - "Type": "String", - "Desc": "Column 
to use for example groupId", + "Name": "FeatureFirstUsePenalty", + "Type": "Float", + "Desc": "The feature first use penalty coefficient", "Aliases": [ - "groupId" + "ffup" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "GroupId" + "Default": 0.0 }, { - "Name": "NormalizeFeatures", - "Type": { - "Kind": "Enum", - "Values": [ - "No", - "Warn", - "Auto", - "Yes" - ] - }, - "Desc": "Normalize option for the feature column", + "Name": "FeatureReusePenalty", + "Type": "Float", + "Desc": "The feature re-use penalty (regularization) coefficient", "Aliases": [ - "norm" + "frup" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": 0.0 }, { - "Name": "Caching", - "Type": { - "Kind": "Enum", - "Values": [ - "Auto", - "Memory", - "Disk", - "None" - ] - }, - "Desc": "Whether learner should cache input training data", + "Name": "GainConfidenceLevel", + "Type": "Float", + "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", "Aliases": [ - "cache" + "gainconf" ], "Required": false, - "SortOrder": 6.0, - "IsNullable": false, - "Default": "Auto" - }, - { - "Name": "ShuffleLabels", - "Type": "Bool", - "Desc": "Shuffle the labels on every iteration. 
Useful probably only if using this tree as a tree leaf featurizer for multiclass.", - "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.0 }, { - "Name": "QuantileSampleCount", - "Type": "Int", - "Desc": "Number of labels to be sampled from each leaf to make the distribtuion", + "Name": "SoftmaxTemperature", + "Type": "Float", + "Desc": "The temperature of the randomized softmax distribution for choosing the feature", "Aliases": [ - "qsc" + "smtemp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 100 + "Default": 0.0 }, { - "Name": "ParallelTrainer", - "Type": { - "Kind": "Component", - "ComponentKind": "ParallelTraining" - }, - "Desc": "Allows to choose Parallel FastTree Learning Algorithm", + "Name": "ExecutionTimes", + "Type": "Bool", + "Desc": "Print execution time breakdown to stdout", "Aliases": [ - "parag" + "et" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": { - "Name": "Single" - } + "Default": false }, { - "Name": "NumThreads", - "Type": "Int", - "Desc": "The number of threads to use", + "Name": "FeatureFraction", + "Type": "Float", + "Desc": "The fraction of features (chosen randomly) to use on each iteration", "Aliases": [ - "t" + "ff" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 0.7 }, { - "Name": "RngSeed", + "Name": "BaggingSize", "Type": "Int", - "Desc": "The seed of the random number generator", + "Desc": "Number of trees in each bag (0 for disabling bagging)", "Aliases": [ - "r1" + "bag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 123 + "Default": 1 }, { - "Name": "FeatureSelectSeed", - "Type": "Int", - "Desc": "The seed of the active feature selection", + "Name": "BaggingTrainFraction", + "Type": "Float", + "Desc": "Percentage of training examples used in each bag", "Aliases": [ - "r3" + "bagfrac" ], "Required": false, "SortOrder": 150.0, 
"IsNullable": false, - "Default": 123 + "Default": 0.7 }, { - "Name": "EntropyCoefficient", + "Name": "SplitFraction", "Type": "Float", - "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ - "e" + "sf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 0.7 }, { - "Name": "HistogramPoolSize", - "Type": "Int", - "Desc": "The number of histograms in the pool (between 2 and numLeaves)", + "Name": "Smoothing", + "Type": "Float", + "Desc": "Smoothing paramter for tree regularization", "Aliases": [ - "ps" + "s" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": -1 + "Default": 0.0 }, { - "Name": "DiskTranspose", + "Name": "AllowEmptyTrees", "Type": "Bool", - "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", + "Desc": "When a root split is impossible, allow training to proceed", "Aliases": [ - "dt" + "allowempty", + "dummies" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": true }, { - "Name": "FeatureFlocks", - "Type": "Bool", - "Desc": "Whether to collectivize features during dataset preparation to speed up training", + "Name": "FeatureCompressionLevel", + "Type": "Int", + "Desc": "The level of feature compression to use", "Aliases": [ - "flocks" + "fcomp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": 1 }, { - "Name": "CategoricalSplit", + "Name": "CompressEnsemble", "Type": "Bool", - "Desc": "Whether to do split based on multiple categorical feature values.", + "Desc": "Compress the tree Ensemble", "Aliases": [ - "cat" + "cmp" ], "Required": false, "SortOrder": 150.0, @@ -4644,568 +5053,515 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "MaxTreesAfterCompression", "Type": 
"Int", - "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", + "Desc": "Maximum Number of trees after compression", "Aliases": [ - "mcg" + "cmpmax" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 64 + "Default": -1 }, { - "Name": "MaxCategoricalSplitPoints", - "Type": "Int", - "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", + "Name": "PrintTestGraph", + "Type": "Bool", + "Desc": "Print metrics graph for the first test set", "Aliases": [ - "maxcat" + "graph" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 64 + "Default": false }, { - "Name": "MinDocsPercentageForCategoricalSplit", - "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Name": "PrintTrainValidGraph", + "Type": "Bool", + "Desc": "Print Train and Validation metrics in graph", "Aliases": [ - "mdop" + "graphtv" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.001 + "Default": false }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "TestFrequency", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Calculate metric values for train/valid/test every k rounds", "Aliases": [ - "mdo" + "tf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 100 - }, + "Default": 2147483647 + } + ], + "Outputs": [ { - "Name": "Bias", - "Type": "Float", - "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", - "Aliases": [ - "bias" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithGroupId", + 
"ITrainerInputWithWeight", + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IBinaryClassificationOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.FastForestRegressor", + "Desc": "Trains a random forest to fit target values using least-squares.", + "FriendlyName": "FastForestRegression", + "ShortName": "ffr", + "Inputs": [ { - "Name": "Bundling", - "Type": { - "Kind": "Enum", - "Values": [ - "None", - "AggregateLowPopulation", - "Adjacent" - ] - }, - "Desc": "Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", + "Name": "NumTrees", + "Type": "Int", + "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ - "bundle" + "iter" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 1.0, "IsNullable": false, - "Default": "None" + "Default": 100, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 20, + 100, + 500 + ] + } }, { - "Name": "MaxBins", - "Type": "Int", - "Desc": "Maximum number of distinct values (bins) per feature", + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", "Aliases": [ - "mb" + "data" ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 255 + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "SparsifyThreshold", - "Type": "Float", - "Desc": "Sparsity level needed to use sparse feature representation", + "Name": "NumLeaves", + "Type": "Int", + "Desc": "The max number of leaves in each regression tree", "Aliases": [ - "sp" + "nl" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": 0.7 + "Default": 20, + "SweepRange": { + "RangeType": "Long", + "Min": 2, + "Max": 128, + "StepSize": 4.0, + "IsLogScale": true + } }, { - "Name": "FeatureFirstUsePenalty", - "Type": "Float", - "Desc": "The feature first use penalty coefficient", + 
"Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", "Aliases": [ - "ffup" + "feat" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": 0.0 + "Default": "Features" }, { - "Name": "FeatureReusePenalty", - "Type": "Float", - "Desc": "The feature re-use penalty (regularization) coefficient", + "Name": "MinDocumentsInLeafs", + "Type": "Int", + "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ - "frup" + "mil" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 0.0 + "Default": 10, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 10, + 50 + ] + } }, { - "Name": "GainConfidenceLevel", - "Type": "Float", - "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", "Aliases": [ - "gainconf" + "lab" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 0.0 + "Default": "Label" }, { - "Name": "SoftmaxTemperature", - "Type": "Float", - "Desc": "The temperature of the randomized softmax distribution for choosing the feature", + "Name": "WeightColumn", + "Type": "String", + "Desc": "Column to use for example weight", "Aliases": [ - "smtemp" + "weight" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": 0.0 + "Default": "Weight" }, { - "Name": "ExecutionTimes", - "Type": "Bool", - "Desc": "Print execution time breakdown to stdout", + "Name": "GroupIdColumn", + "Type": "String", + "Desc": "Column to use for example groupId", "Aliases": [ - "et" + "groupId" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": false + "Default": "GroupId" }, { - "Name": "FeatureFraction", - "Type": "Float", - "Desc": "The fraction of features 
(chosen randomly) to use on each iteration", + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", "Aliases": [ - "ff" + "norm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": 0.7 + "Default": "Auto" }, { - "Name": "BaggingSize", - "Type": "Int", - "Desc": "Number of trees in each bag (0 for disabling bagging)", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", "Aliases": [ - "bag" + "cache" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": 1 + "Default": "Auto" }, { - "Name": "BaggingTrainFraction", - "Type": "Float", - "Desc": "Percentage of training examples used in each bag", - "Aliases": [ - "bagfrac" - ], + "Name": "ShuffleLabels", + "Type": "Bool", + "Desc": "Shuffle the labels on every iteration. 
Useful probably only if using this tree as a tree leaf featurizer for multiclass.", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.7 + "Default": false }, { - "Name": "SplitFraction", - "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each split", + "Name": "QuantileSampleCount", + "Type": "Int", + "Desc": "Number of labels to be sampled from each leaf to make the distribtuion", "Aliases": [ - "sf" + "qsc" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.7 + "Default": 100 }, { - "Name": "Smoothing", - "Type": "Float", - "Desc": "Smoothing paramter for tree regularization", + "Name": "ParallelTrainer", + "Type": { + "Kind": "Component", + "ComponentKind": "ParallelTraining" + }, + "Desc": "Allows to choose Parallel FastTree Learning Algorithm", "Aliases": [ - "s" + "parag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": { + "Name": "Single" + } }, { - "Name": "AllowEmptyTrees", - "Type": "Bool", - "Desc": "When a root split is impossible, allow training to proceed", + "Name": "NumThreads", + "Type": "Int", + "Desc": "The number of threads to use", "Aliases": [ - "allowempty", - "dummies" + "t" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": true + "IsNullable": true, + "Default": null }, { - "Name": "FeatureCompressionLevel", + "Name": "RngSeed", "Type": "Int", - "Desc": "The level of feature compression to use", + "Desc": "The seed of the random number generator", "Aliases": [ - "fcomp" + "r1" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": 123 }, { - "Name": "CompressEnsemble", - "Type": "Bool", - "Desc": "Compress the tree Ensemble", + "Name": "FeatureSelectSeed", + "Type": "Int", + "Desc": "The seed of the active feature selection", "Aliases": [ - "cmp" + "r3" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 123 
}, { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", + "Name": "EntropyCoefficient", + "Type": "Float", + "Desc": "The entropy (regularization) coefficient between 0 and 1", "Aliases": [ - "cmpmax" + "e" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": -1 + "Default": 0.0 }, { - "Name": "PrintTestGraph", - "Type": "Bool", - "Desc": "Print metrics graph for the first test set", + "Name": "HistogramPoolSize", + "Type": "Int", + "Desc": "The number of histograms in the pool (between 2 and numLeaves)", "Aliases": [ - "graph" + "ps" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": -1 }, { - "Name": "PrintTrainValidGraph", + "Name": "DiskTranspose", "Type": "Bool", - "Desc": "Print Train and Validation metrics in graph", + "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", "Aliases": [ - "graphtv" + "dt" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": false + "IsNullable": true, + "Default": null }, { - "Name": "TestFrequency", - "Type": "Int", - "Desc": "Calculate metric values for train/valid/test every k rounds", + "Name": "FeatureFlocks", + "Type": "Bool", + "Desc": "Whether to collectivize features during dataset preparation to speed up training", "Aliases": [ - "tf" + "flocks" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 2147483647 - } - ], - "Outputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The trained model" - } - ], - "InputKind": [ - "ITrainerInputWithGroupId", - "ITrainerInputWithWeight", - "ITrainerInputWithLabel", - "ITrainerInput" - ], - "OutputKind": [ - "IRegressionOutput", - "ITrainerOutput" - ] - }, - { - "Name": "Trainers.FastTreeBinaryClassifier", - "Desc": "Uses a logit-boost boosted tree learner to perform binary classification.", - 
"FriendlyName": "FastTree (Boosted Trees) Classification", - "ShortName": "ftc", - "Inputs": [ - { - "Name": "NumTrees", - "Type": "Int", - "Desc": "Total number of decision trees to create in the ensemble", - "Aliases": [ - "iter" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": 100, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 20, - 100, - 500 - ] - } - }, - { - "Name": "TrainingData", - "Type": "DataView", - "Desc": "The data to be used for training", - "Aliases": [ - "data" - ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "NumLeaves", - "Type": "Int", - "Desc": "The max number of leaves in each regression tree", - "Aliases": [ - "nl" - ], - "Required": false, - "SortOrder": 2.0, - "IsNullable": false, - "Default": 20, - "SweepRange": { - "RangeType": "Long", - "Min": 2, - "Max": 128, - "StepSize": 4.0, - "IsLogScale": true - } + "Default": true }, { - "Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", + "Name": "CategoricalSplit", + "Type": "Bool", + "Desc": "Whether to do split based on multiple categorical feature values.", "Aliases": [ - "feat" + "cat" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Features" + "Default": false }, { - "Name": "MinDocumentsInLeafs", + "Name": "MaxCategoricalGroupsPerNode", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ - "mil" + "mcg" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 10, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 1, - 10, - 50 - ] - } + "Default": 64 }, { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Column to use for labels", + "Name": "MaxCategoricalSplitPoints", + "Type": "Int", + "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ - "lab" + "maxcat" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Label" + "Default": 64 }, { - "Name": "LearningRates", + "Name": "MinDocsPercentageForCategoricalSplit", "Type": "Float", - "Desc": "The learning rate", + "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", "Aliases": [ - "lr" + "mdop" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 0.2, - "SweepRange": { - "RangeType": "Float", - "Min": 0.025, - "Max": 0.4, - "IsLogScale": true - } + "Default": 0.001 }, { - "Name": "WeightColumn", - "Type": "String", - "Desc": "Column to use for example weight", + "Name": "MinDocsForCategoricalSplit", + "Type": "Int", + "Desc": "Minimum categorical doc count in a bin to consider for a split.", "Aliases": [ - "weight" + "mdo" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Weight" + "Default": 100 }, { - "Name": "GroupIdColumn", - "Type": "String", - "Desc": "Column to use for example groupId", + "Name": "Bias", + "Type": "Float", + "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", "Aliases": [ - "groupId" + "bias" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "GroupId" + "Default": 0.0 }, { - "Name": "NormalizeFeatures", + "Name": "Bundling", 
"Type": { "Kind": "Enum", "Values": [ - "No", - "Warn", - "Auto", - "Yes" + "None", + "AggregateLowPopulation", + "Adjacent" ] }, - "Desc": "Normalize option for the feature column", + "Desc": "Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", "Aliases": [ - "norm" + "bundle" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": "None" }, { - "Name": "Caching", - "Type": { - "Kind": "Enum", - "Values": [ - "Auto", - "Memory", - "Disk", - "None" - ] - }, - "Desc": "Whether learner should cache input training data", + "Name": "MaxBins", + "Type": "Int", + "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ - "cache" + "mb" ], "Required": false, - "SortOrder": 6.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": 255 }, { - "Name": "UnbalancedSets", - "Type": "Bool", - "Desc": "Should we use derivatives optimized for unbalanced sets", + "Name": "SparsifyThreshold", + "Type": "Float", + "Desc": "Sparsity level needed to use sparse feature representation", "Aliases": [ - "us" + "sp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.7 }, { - "Name": "BestStepRankingRegressionTrees", - "Type": "Bool", - "Desc": "Use best regression step trees?", + "Name": "FeatureFirstUsePenalty", + "Type": "Float", + "Desc": "The feature first use penalty coefficient", "Aliases": [ - "bsr" + "ffup" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.0 }, { - "Name": "UseLineSearch", - "Type": "Bool", - "Desc": "Should we use line search for a step size", + "Name": "FeatureReusePenalty", + "Type": "Float", + "Desc": "The feature re-use penalty (regularization) coefficient", "Aliases": [ - "ls" + "frup" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - 
"Default": false + "Default": 0.0 }, { - "Name": "NumPostBracketSteps", - "Type": "Int", - "Desc": "Number of post-bracket line search steps", + "Name": "GainConfidenceLevel", + "Type": "Float", + "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", "Aliases": [ - "lssteps" + "gainconf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 0.0 }, { - "Name": "MinStepSize", + "Name": "SoftmaxTemperature", "Type": "Float", - "Desc": "Minimum line search step size", + "Desc": "The temperature of the randomized softmax distribution for choosing the feature", "Aliases": [ - "minstep" + "smtemp" ], "Required": false, "SortOrder": 150.0, @@ -5213,145 +5569,96 @@ "Default": 0.0 }, { - "Name": "OptimizationAlgorithm", - "Type": { - "Kind": "Enum", - "Values": [ - "GradientDescent", - "AcceleratedGradientDescent", - "ConjugateGradientDescent" - ] - }, - "Desc": "Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent)", + "Name": "ExecutionTimes", + "Type": "Bool", + "Desc": "Print execution time breakdown to stdout", "Aliases": [ - "oa" + "et" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "GradientDescent" + "Default": false }, { - "Name": "EarlyStoppingRule", - "Type": { - "Kind": "Component", - "ComponentKind": "EarlyStoppingCriterion" - }, - "Desc": "Early stopping rule. (Validation set (/valid) is required.)", + "Name": "FeatureFraction", + "Type": "Float", + "Desc": "The fraction of features (chosen randomly) to use on each iteration", "Aliases": [ - "esr" + "ff" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": 0.7 }, { - "Name": "EarlyStoppingMetrics", + "Name": "BaggingSize", "Type": "Int", - "Desc": "Early stopping metrics. 
(For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3)", - "Aliases": [ - "esmt" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0 - }, - { - "Name": "EnablePruning", - "Type": "Bool", - "Desc": "Enable post-training pruning to avoid overfitting. (a validation set is required)", - "Aliases": [ - "pruning" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "UseTolerantPruning", - "Type": "Bool", - "Desc": "Use window and tolerance for pruning", + "Desc": "Number of trees in each bag (0 for disabling bagging)", "Aliases": [ - "prtol" + "bag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 1 }, { - "Name": "PruningThreshold", + "Name": "BaggingTrainFraction", "Type": "Float", - "Desc": "The tolerance threshold for pruning", + "Desc": "Percentage of training examples used in each bag", "Aliases": [ - "prth" + "bagfrac" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.004 + "Default": 0.7 }, { - "Name": "PruningWindowSize", - "Type": "Int", - "Desc": "The moving window size for pruning", + "Name": "SplitFraction", + "Type": "Float", + "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ - "prws" + "sf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 5 + "Default": 0.7 }, { - "Name": "Shrinkage", + "Name": "Smoothing", "Type": "Float", - "Desc": "Shrinkage", + "Desc": "Smoothing paramter for tree regularization", "Aliases": [ - "shrk" + "s" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.025, - "Max": 4.0, - "IsLogScale": true - } + "Default": 0.0 }, { - "Name": "DropoutRate", - "Type": "Float", - "Desc": "Dropout rate for tree regularization", + "Name": "AllowEmptyTrees", + "Type": "Bool", + "Desc": "When a root split is impossible, allow training 
to proceed", "Aliases": [ - "tdrop" + "allowempty", + "dummies" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.0, - 1E-09, - 0.05, - 0.1, - 0.2 - ] - } + "Default": true }, { - "Name": "GetDerivativesSampleRate", + "Name": "FeatureCompressionLevel", "Type": "Int", - "Desc": "Sample each query 1 in k times in the GetDerivatives function", + "Desc": "The level of feature compression to use", "Aliases": [ - "sr" + "fcomp" ], "Required": false, "SortOrder": 150.0, @@ -5359,11 +5666,11 @@ "Default": 1 }, { - "Name": "WriteLastEnsemble", + "Name": "CompressEnsemble", "Type": "Bool", - "Desc": "Write the last ensemble instead of the one determined by early stopping", + "Desc": "Compress the tree Ensemble", "Aliases": [ - "hl" + "cmp" ], "Required": false, "SortOrder": 150.0, @@ -5371,23 +5678,23 @@ "Default": false }, { - "Name": "MaxTreeOutput", - "Type": "Float", - "Desc": "Upper bound on absolute value of single tree output", + "Name": "MaxTreesAfterCompression", + "Type": "Int", + "Desc": "Maximum Number of trees after compression", "Aliases": [ - "mo" + "cmpmax" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 100.0 + "Default": -1 }, { - "Name": "RandomStart", + "Name": "PrintTestGraph", "Type": "Bool", - "Desc": "Training starts from random ordering (determined by /r1)", + "Desc": "Print metrics graph for the first test set", "Aliases": [ - "rs" + "graph" ], "Required": false, "SortOrder": 150.0, @@ -5395,11 +5702,11 @@ "Default": false }, { - "Name": "FilterZeroLambdas", + "Name": "PrintTrainValidGraph", "Type": "Bool", - "Desc": "Filter zero lambdas during training", + "Desc": "Print Train and Validation metrics in graph", "Aliases": [ - "fzl" + "graphtv" ], "Required": false, "SortOrder": 150.0, @@ -5407,208 +5714,272 @@ "Default": false }, { - "Name": "BaselineScoresFormula", - "Type": "String", - "Desc": "Freeform defining the scores that 
should be used as the baseline ranker", + "Name": "TestFrequency", + "Type": "Int", + "Desc": "Calculate metric values for train/valid/test every k rounds", "Aliases": [ - "basescores" + "tf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null - }, + "Default": 2147483647 + } + ], + "Outputs": [ { - "Name": "BaselineAlphaRisk", - "Type": "String", - "Desc": "Baseline alpha for tradeoffs of risk (0 is normal training)", + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithGroupId", + "ITrainerInputWithWeight", + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IRegressionOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.FastTreeBinaryClassifier", + "Desc": "Uses a logit-boost boosted tree learner to perform binary classification.", + "FriendlyName": "FastTree (Boosted Trees) Classification", + "ShortName": "ftc", + "Inputs": [ + { + "Name": "NumTrees", + "Type": "Int", + "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ - "basealpha" + "iter" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 1.0, "IsNullable": false, - "Default": null + "Default": 100, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 20, + 100, + 500 + ] + } }, { - "Name": "PositionDiscountFreeform", - "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", "Aliases": [ - "pdff" + "data" ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "ParallelTrainer", - "Type": { - "Kind": "Component", - "ComponentKind": "ParallelTraining" - }, - "Desc": "Allows to choose Parallel FastTree 
Learning Algorithm", + "Name": "NumLeaves", + "Type": "Int", + "Desc": "The max number of leaves in each regression tree", "Aliases": [ - "parag" + "nl" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": { - "Name": "Single" + "Default": 20, + "SweepRange": { + "RangeType": "Long", + "Min": 2, + "Max": 128, + "StepSize": 4.0, + "IsLogScale": true } }, { - "Name": "NumThreads", - "Type": "Int", - "Desc": "The number of threads to use", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", "Aliases": [ - "t" + "feat" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Features" }, { - "Name": "RngSeed", + "Name": "MinDocumentsInLeafs", "Type": "Int", - "Desc": "The seed of the random number generator", + "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ - "r1" + "mil" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 123 + "Default": 10, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 10, + 50 + ] + } }, { - "Name": "FeatureSelectSeed", - "Type": "Int", - "Desc": "The seed of the active feature selection", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", "Aliases": [ - "r3" + "lab" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 123 + "Default": "Label" }, { - "Name": "EntropyCoefficient", + "Name": "LearningRates", "Type": "Float", - "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Desc": "The learning rate", "Aliases": [ - "e" + "lr" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": 0.0 + "Default": 0.2, + "SweepRange": { + "RangeType": "Float", + "Min": 0.025, + "Max": 0.4, + "IsLogScale": true + } }, { - 
"Name": "HistogramPoolSize", - "Type": "Int", - "Desc": "The number of histograms in the pool (between 2 and numLeaves)", + "Name": "WeightColumn", + "Type": "String", + "Desc": "Column to use for example weight", "Aliases": [ - "ps" + "weight" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": -1 + "Default": "Weight" }, { - "Name": "DiskTranspose", - "Type": "Bool", - "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", + "Name": "GroupIdColumn", + "Type": "String", + "Desc": "Column to use for example groupId", "Aliases": [ - "dt" + "groupId" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "SortOrder": 5.0, + "IsNullable": false, + "Default": "GroupId" }, { - "Name": "FeatureFlocks", - "Type": "Bool", - "Desc": "Whether to collectivize features during dataset preparation to speed up training", + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", "Aliases": [ - "flocks" + "norm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": true + "Default": "Auto" }, { - "Name": "CategoricalSplit", - "Type": "Bool", - "Desc": "Whether to do split based on multiple categorical feature values.", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", "Aliases": [ - "cat" + "cache" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": false + "Default": "Auto" }, { - "Name": "MaxCategoricalGroupsPerNode", - "Type": "Int", - "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", + "Name": "UnbalancedSets", + "Type": "Bool", + "Desc": "Should we use derivatives optimized for unbalanced sets", "Aliases": [ - "mcg" + "us" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 64 + "Default": false }, { - "Name": "MaxCategoricalSplitPoints", - "Type": "Int", - "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", + "Name": "BestStepRankingRegressionTrees", + "Type": "Bool", + "Desc": "Use best regression step trees?", "Aliases": [ - "maxcat" + "bsr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 64 + "Default": false }, { - "Name": "MinDocsPercentageForCategoricalSplit", - "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Name": "UseLineSearch", + "Type": "Bool", + "Desc": "Should we use line search for a step size", "Aliases": [ - "mdop" + "ls" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.001 + "Default": false }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "NumPostBracketSteps", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Number of post-bracket line search steps", "Aliases": [ - "mdo" + "lssteps" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 100 + "Default": 0 }, { - "Name": "Bias", + "Name": "MinStepSize", "Type": "Float", - "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", + "Desc": "Minimum line search step size", "Aliases": [ - "bias" + "minstep" ], "Required": false, "SortOrder": 150.0, @@ -5616,529 +5987,505 @@ "Default": 0.0 }, { - "Name": "Bundling", + "Name": "OptimizationAlgorithm", "Type": { "Kind": "Enum", "Values": [ - "None", - "AggregateLowPopulation", - "Adjacent" + "GradientDescent", + "AcceleratedGradientDescent", + 
"ConjugateGradientDescent" ] }, - "Desc": "Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", + "Desc": "Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent)", "Aliases": [ - "bundle" + "oa" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "None" + "Default": "GradientDescent" }, { - "Name": "MaxBins", - "Type": "Int", - "Desc": "Maximum number of distinct values (bins) per feature", + "Name": "EarlyStoppingRule", + "Type": { + "Kind": "Component", + "ComponentKind": "EarlyStoppingCriterion" + }, + "Desc": "Early stopping rule. (Validation set (/valid) is required.)", "Aliases": [ - "mb" + "esr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 255 + "Default": null }, { - "Name": "SparsifyThreshold", - "Type": "Float", - "Desc": "Sparsity level needed to use sparse feature representation", + "Name": "EarlyStoppingMetrics", + "Type": "Int", + "Desc": "Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3)", "Aliases": [ - "sp" + "esmt" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.7 + "Default": 0 }, { - "Name": "FeatureFirstUsePenalty", - "Type": "Float", - "Desc": "The feature first use penalty coefficient", + "Name": "EnablePruning", + "Type": "Bool", + "Desc": "Enable post-training pruning to avoid overfitting. 
(a validation set is required)", "Aliases": [ - "ffup" + "pruning" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": false }, { - "Name": "FeatureReusePenalty", - "Type": "Float", - "Desc": "The feature re-use penalty (regularization) coefficient", + "Name": "UseTolerantPruning", + "Type": "Bool", + "Desc": "Use window and tolerance for pruning", "Aliases": [ - "frup" + "prtol" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": false }, { - "Name": "GainConfidenceLevel", + "Name": "PruningThreshold", "Type": "Float", - "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", + "Desc": "The tolerance threshold for pruning", "Aliases": [ - "gainconf" + "prth" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 0.004 }, { - "Name": "SoftmaxTemperature", - "Type": "Float", - "Desc": "The temperature of the randomized softmax distribution for choosing the feature", + "Name": "PruningWindowSize", + "Type": "Int", + "Desc": "The moving window size for pruning", "Aliases": [ - "smtemp" + "prws" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 5 }, { - "Name": "ExecutionTimes", - "Type": "Bool", - "Desc": "Print execution time breakdown to stdout", + "Name": "Shrinkage", + "Type": "Float", + "Desc": "Shrinkage", "Aliases": [ - "et" + "shrk" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 1.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.025, + "Max": 4.0, + "IsLogScale": true + } }, { - "Name": "FeatureFraction", + "Name": "DropoutRate", "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each iteration", + "Desc": "Dropout rate for tree regularization", "Aliases": [ - "ff" + "tdrop" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0 + "Default": 0.0, + "SweepRange": { 
+ "RangeType": "Discrete", + "Values": [ + 0.0, + 1E-09, + 0.05, + 0.1, + 0.2 + ] + } }, { - "Name": "BaggingSize", + "Name": "GetDerivativesSampleRate", "Type": "Int", - "Desc": "Number of trees in each bag (0 for disabling bagging)", + "Desc": "Sample each query 1 in k times in the GetDerivatives function", "Aliases": [ - "bag" + "sr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1 }, { - "Name": "BaggingTrainFraction", - "Type": "Float", - "Desc": "Percentage of training examples used in each bag", + "Name": "WriteLastEnsemble", + "Type": "Bool", + "Desc": "Write the last ensemble instead of the one determined by early stopping", "Aliases": [ - "bagfrac" + "hl" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.7 + "Default": false }, { - "Name": "SplitFraction", + "Name": "MaxTreeOutput", "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each split", + "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ - "sf" + "mo" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0 + "Default": 100.0 }, { - "Name": "Smoothing", - "Type": "Float", - "Desc": "Smoothing paramter for tree regularization", + "Name": "RandomStart", + "Type": "Bool", + "Desc": "Training starts from random ordering (determined by /r1)", "Aliases": [ - "s" + "rs" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": false }, { - "Name": "AllowEmptyTrees", + "Name": "FilterZeroLambdas", "Type": "Bool", - "Desc": "When a root split is impossible, allow training to proceed", + "Desc": "Filter zero lambdas during training", "Aliases": [ - "allowempty", - "dummies" + "fzl" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": false }, { - "Name": "FeatureCompressionLevel", - "Type": "Int", - "Desc": "The level of feature compression to use", + "Name": "BaselineScoresFormula", 
+ "Type": "String", + "Desc": "Freeform defining the scores that should be used as the baseline ranker", "Aliases": [ - "fcomp" + "basescores" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": null }, { - "Name": "CompressEnsemble", - "Type": "Bool", - "Desc": "Compress the tree Ensemble", + "Name": "BaselineAlphaRisk", + "Type": "String", + "Desc": "Baseline alpha for tradeoffs of risk (0 is normal training)", "Aliases": [ - "cmp" + "basealpha" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": null }, { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", + "Name": "PositionDiscountFreeform", + "Type": "String", + "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ - "cmpmax" + "pdff" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": -1 + "Default": null }, { - "Name": "PrintTestGraph", - "Type": "Bool", - "Desc": "Print metrics graph for the first test set", + "Name": "ParallelTrainer", + "Type": { + "Kind": "Component", + "ComponentKind": "ParallelTraining" + }, + "Desc": "Allows to choose Parallel FastTree Learning Algorithm", "Aliases": [ - "graph" + "parag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": { + "Name": "Single" + } }, { - "Name": "PrintTrainValidGraph", - "Type": "Bool", - "Desc": "Print Train and Validation metrics in graph", + "Name": "NumThreads", + "Type": "Int", + "Desc": "The number of threads to use", "Aliases": [ - "graphtv" + "t" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": false + "IsNullable": true, + "Default": null }, { - "Name": "TestFrequency", + "Name": "RngSeed", "Type": "Int", - "Desc": "Calculate metric values for train/valid/test every k rounds", + 
"Desc": "The seed of the random number generator", "Aliases": [ - "tf" + "r1" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 2147483647 - } - ], - "Outputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The trained model" - } - ], - "InputKind": [ - "ITrainerInputWithGroupId", - "ITrainerInputWithWeight", - "ITrainerInputWithLabel", - "ITrainerInput" - ], - "OutputKind": [ - "IBinaryClassificationOutput", - "ITrainerOutput" - ] - }, - { - "Name": "Trainers.FastTreeRanker", - "Desc": "Trains gradient boosted decision trees to the LambdaRank quasi-gradient.", - "FriendlyName": "FastTree (Boosted Trees) Ranking", - "ShortName": "ftrank", - "Inputs": [ + "Default": 123 + }, { - "Name": "NumTrees", + "Name": "FeatureSelectSeed", "Type": "Int", - "Desc": "Total number of decision trees to create in the ensemble", + "Desc": "The seed of the active feature selection", "Aliases": [ - "iter" + "r3" ], "Required": false, - "SortOrder": 1.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 100, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 20, - 100, - 500 - ] - } + "Default": 123 }, { - "Name": "TrainingData", - "Type": "DataView", - "Desc": "The data to be used for training", + "Name": "EntropyCoefficient", + "Type": "Float", + "Desc": "The entropy (regularization) coefficient between 0 and 1", "Aliases": [ - "data" + "e" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 }, { - "Name": "NumLeaves", + "Name": "HistogramPoolSize", "Type": "Int", - "Desc": "The max number of leaves in each regression tree", + "Desc": "The number of histograms in the pool (between 2 and numLeaves)", "Aliases": [ - "nl" + "ps" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 20, - "SweepRange": { - "RangeType": "Long", - "Min": 2, - "Max": 128, - "StepSize": 4.0, - 
"IsLogScale": true - } + "Default": -1 }, { - "Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", + "Name": "DiskTranspose", + "Type": "Bool", + "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", "Aliases": [ - "feat" + "dt" ], "Required": false, - "SortOrder": 2.0, - "IsNullable": false, - "Default": "Features" + "SortOrder": 150.0, + "IsNullable": true, + "Default": null }, { - "Name": "MinDocumentsInLeafs", - "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Name": "FeatureFlocks", + "Type": "Bool", + "Desc": "Whether to collectivize features during dataset preparation to speed up training", "Aliases": [ - "mil" + "flocks" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 10, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 1, - 10, - 50 - ] - } + "Default": true }, { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Column to use for labels", + "Name": "CategoricalSplit", + "Type": "Bool", + "Desc": "Whether to do split based on multiple categorical feature values.", "Aliases": [ - "lab" + "cat" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Label" + "Default": false }, { - "Name": "LearningRates", - "Type": "Float", - "Desc": "The learning rate", + "Name": "MaxCategoricalGroupsPerNode", + "Type": "Int", + "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ - "lr" + "mcg" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 0.2, - "SweepRange": { - "RangeType": "Float", - "Min": 0.025, - "Max": 0.4, - "IsLogScale": true - } + "Default": 64 }, { - "Name": "WeightColumn", - "Type": "String", - "Desc": "Column to use for example weight", + "Name": "MaxCategoricalSplitPoints", + "Type": "Int", + "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ - "weight" + "maxcat" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Weight" + "Default": 64 }, { - "Name": "GroupIdColumn", - "Type": "String", - "Desc": "Column to use for example groupId", + "Name": "MinDocsPercentageForCategoricalSplit", + "Type": "Float", + "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", "Aliases": [ - "groupId" + "mdop" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "GroupId" + "Default": 0.001 }, { - "Name": "NormalizeFeatures", - "Type": { - "Kind": "Enum", - "Values": [ - "No", - "Warn", - "Auto", - "Yes" - ] - }, - "Desc": "Normalize option for the feature column", + "Name": "MinDocsForCategoricalSplit", + "Type": "Int", + "Desc": "Minimum categorical doc count in a bin to consider for a split.", "Aliases": [ - "norm" + "mdo" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": 100 }, { - "Name": "Caching", + "Name": "Bias", + "Type": "Float", + "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", + "Aliases": [ + "bias" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 + }, + { + "Name": "Bundling", "Type": { "Kind": "Enum", "Values": [ - "Auto", - "Memory", - "Disk", - "None" + "None", + 
"AggregateLowPopulation", + "Adjacent" ] }, - "Desc": "Whether learner should cache input training data", + "Desc": "Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", "Aliases": [ - "cache" + "bundle" ], "Required": false, - "SortOrder": 6.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": "None" }, { - "Name": "CustomGains", - "Type": "String", - "Desc": "Comma seperated list of gains associated to each relevance label.", + "Name": "MaxBins", + "Type": "Int", + "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ - "gains" + "mb" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "0,3,7,15,31" + "Default": 255 }, { - "Name": "TrainDcg", - "Type": "Bool", - "Desc": "Train DCG instead of NDCG", + "Name": "SparsifyThreshold", + "Type": "Float", + "Desc": "Sparsity level needed to use sparse feature representation", "Aliases": [ - "dcg" + "sp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.7 }, { - "Name": "SortingAlgorithm", - "Type": "String", - "Desc": "The sorting algorithm to use for DCG and LambdaMart calculations [DescendingStablePessimistic/DescendingStable/DescendingReverse/DescendingDotNet]", + "Name": "FeatureFirstUsePenalty", + "Type": "Float", + "Desc": "The feature first use penalty coefficient", "Aliases": [ - "sort" + "ffup" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "DescendingStablePessimistic" + "Default": 0.0 }, { - "Name": "LambdaMartMaxTruncation", - "Type": "Int", - "Desc": "max-NDCG truncation to use in the Lambda Mart algorithm", + "Name": "FeatureReusePenalty", + "Type": "Float", + "Desc": "The feature re-use penalty (regularization) coefficient", "Aliases": [ - "n" + "frup" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 100 + "Default": 0.0 
}, { - "Name": "ShiftedNdcg", - "Type": "Bool", - "Desc": "Use shifted NDCG", + "Name": "GainConfidenceLevel", + "Type": "Float", + "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", + "Aliases": [ + "gainconf" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.0 }, { - "Name": "CostFunctionParam", - "Type": "Char", - "Desc": "Cost function parameter (w/c)", + "Name": "SoftmaxTemperature", + "Type": "Float", + "Desc": "The temperature of the randomized softmax distribution for choosing the feature", "Aliases": [ - "cf" + "smtemp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "w" + "Default": 0.0 }, { - "Name": "DistanceWeight2", + "Name": "ExecutionTimes", "Type": "Bool", - "Desc": "Distance weight 2 adjustment to cost", + "Desc": "Print execution time breakdown to stdout", "Aliases": [ - "dw" + "et" ], "Required": false, "SortOrder": 150.0, @@ -6146,59 +6493,59 @@ "Default": false }, { - "Name": "NormalizeQueryLambdas", - "Type": "Bool", - "Desc": "Normalize query lambdas", + "Name": "FeatureFraction", + "Type": "Float", + "Desc": "The fraction of features (chosen randomly) to use on each iteration", "Aliases": [ - "nql" + "ff" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 1.0 }, { - "Name": "BestStepRankingRegressionTrees", - "Type": "Bool", - "Desc": "Use best regression step trees?", + "Name": "BaggingSize", + "Type": "Int", + "Desc": "Number of trees in each bag (0 for disabling bagging)", "Aliases": [ - "bsr" + "bag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0 }, { - "Name": "UseLineSearch", - "Type": "Bool", - "Desc": "Should we use line search for a step size", + "Name": "BaggingTrainFraction", + "Type": "Float", + "Desc": "Percentage of training examples used in each bag", "Aliases": [ - "ls" + "bagfrac" ], "Required": false, "SortOrder": 150.0, 
"IsNullable": false, - "Default": false + "Default": 0.7 }, { - "Name": "NumPostBracketSteps", - "Type": "Int", - "Desc": "Number of post-bracket line search steps", + "Name": "SplitFraction", + "Type": "Float", + "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ - "lssteps" + "sf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1.0 }, { - "Name": "MinStepSize", + "Name": "Smoothing", "Type": "Float", - "Desc": "Minimum line search step size", + "Desc": "Smoothing paramter for tree regularization", "Aliases": [ - "minstep" + "s" ], "Required": false, "SortOrder": 150.0, @@ -6206,69 +6553,60 @@ "Default": 0.0 }, { - "Name": "OptimizationAlgorithm", - "Type": { - "Kind": "Enum", - "Values": [ - "GradientDescent", - "AcceleratedGradientDescent", - "ConjugateGradientDescent" - ] - }, - "Desc": "Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent)", + "Name": "AllowEmptyTrees", + "Type": "Bool", + "Desc": "When a root split is impossible, allow training to proceed", "Aliases": [ - "oa" + "allowempty", + "dummies" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "GradientDescent" + "Default": true }, { - "Name": "EarlyStoppingRule", - "Type": { - "Kind": "Component", - "ComponentKind": "EarlyStoppingCriterion" - }, - "Desc": "Early stopping rule. (Validation set (/valid) is required.)", + "Name": "FeatureCompressionLevel", + "Type": "Int", + "Desc": "The level of feature compression to use", "Aliases": [ - "esr" + "fcomp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": 1 }, { - "Name": "EarlyStoppingMetrics", - "Type": "Int", - "Desc": "Early stopping metrics. 
(For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3)", + "Name": "CompressEnsemble", + "Type": "Bool", + "Desc": "Compress the tree Ensemble", "Aliases": [ - "esmt" + "cmp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": false }, { - "Name": "EnablePruning", - "Type": "Bool", - "Desc": "Enable post-training pruning to avoid overfitting. (a validation set is required)", + "Name": "MaxTreesAfterCompression", + "Type": "Int", + "Desc": "Maximum Number of trees after compression", "Aliases": [ - "pruning" + "cmpmax" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": -1 }, { - "Name": "UseTolerantPruning", + "Name": "PrintTestGraph", "Type": "Bool", - "Desc": "Use window and tolerance for pruning", + "Desc": "Print metrics graph for the first test set", "Aliases": [ - "prtol" + "graph" ], "Required": false, "SortOrder": 150.0, @@ -6276,272 +6614,305 @@ "Default": false }, { - "Name": "PruningThreshold", - "Type": "Float", - "Desc": "The tolerance threshold for pruning", + "Name": "PrintTrainValidGraph", + "Type": "Bool", + "Desc": "Print Train and Validation metrics in graph", "Aliases": [ - "prth" + "graphtv" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.004 + "Default": false }, { - "Name": "PruningWindowSize", + "Name": "TestFrequency", "Type": "Int", - "Desc": "The moving window size for pruning", + "Desc": "Calculate metric values for train/valid/test every k rounds", "Aliases": [ - "prws" + "tf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 5 - }, + "Default": 2147483647 + } + ], + "Outputs": [ { - "Name": "Shrinkage", - "Type": "Float", - "Desc": "Shrinkage", - "Aliases": [ - "shrk" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.025, - "Max": 4.0, - "IsLogScale": true - } - }, + "Name": "PredictorModel", + "Type": 
"PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithGroupId", + "ITrainerInputWithWeight", + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IBinaryClassificationOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.FastTreeRanker", + "Desc": "Trains gradient boosted decision trees to the LambdaRank quasi-gradient.", + "FriendlyName": "FastTree (Boosted Trees) Ranking", + "ShortName": "ftrank", + "Inputs": [ { - "Name": "DropoutRate", - "Type": "Float", - "Desc": "Dropout rate for tree regularization", + "Name": "NumTrees", + "Type": "Int", + "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ - "tdrop" + "iter" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 1.0, "IsNullable": false, - "Default": 0.0, + "Default": 100, "SweepRange": { "RangeType": "Discrete", "Values": [ - 0.0, - 1E-09, - 0.05, - 0.1, - 0.2 + 20, + 100, + 500 ] } }, { - "Name": "GetDerivativesSampleRate", + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "NumLeaves", "Type": "Int", - "Desc": "Sample each query 1 in k times in the GetDerivatives function", + "Desc": "The max number of leaves in each regression tree", "Aliases": [ - "sr" + "nl" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": 1 + "Default": 20, + "SweepRange": { + "RangeType": "Long", + "Min": 2, + "Max": 128, + "StepSize": 4.0, + "IsLogScale": true + } }, { - "Name": "WriteLastEnsemble", - "Type": "Bool", - "Desc": "Write the last ensemble instead of the one determined by early stopping", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", "Aliases": [ - "hl" + "feat" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": false + "Default": 
"Features" }, { - "Name": "MaxTreeOutput", - "Type": "Float", - "Desc": "Upper bound on absolute value of single tree output", + "Name": "MinDocumentsInLeafs", + "Type": "Int", + "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ - "mo" + "mil" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 100.0 + "Default": 10, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 10, + 50 + ] + } }, { - "Name": "RandomStart", - "Type": "Bool", - "Desc": "Training starts from random ordering (determined by /r1)", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", "Aliases": [ - "rs" + "lab" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": false + "Default": "Label" }, { - "Name": "FilterZeroLambdas", - "Type": "Bool", - "Desc": "Filter zero lambdas during training", + "Name": "LearningRates", + "Type": "Float", + "Desc": "The learning rate", "Aliases": [ - "fzl" + "lr" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": false + "Default": 0.2, + "SweepRange": { + "RangeType": "Float", + "Min": 0.025, + "Max": 0.4, + "IsLogScale": true + } }, { - "Name": "BaselineScoresFormula", + "Name": "WeightColumn", "Type": "String", - "Desc": "Freeform defining the scores that should be used as the baseline ranker", + "Desc": "Column to use for example weight", "Aliases": [ - "basescores" + "weight" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": null + "Default": "Weight" }, { - "Name": "BaselineAlphaRisk", + "Name": "GroupIdColumn", "Type": "String", - "Desc": "Baseline alpha for tradeoffs of risk (0 is normal training)", + "Desc": "Column to use for example groupId", "Aliases": [ - "basealpha" + "groupId" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": 
false, - "Default": null + "Default": "GroupId" }, { - "Name": "PositionDiscountFreeform", - "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", "Aliases": [ - "pdff" + "norm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": null + "Default": "Auto" }, { - "Name": "ParallelTrainer", + "Name": "Caching", "Type": { - "Kind": "Component", - "ComponentKind": "ParallelTraining" + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] }, - "Desc": "Allows to choose Parallel FastTree Learning Algorithm", + "Desc": "Whether learner should cache input training data", "Aliases": [ - "parag" + "cache" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": { - "Name": "Single" - } + "Default": "Auto" }, { - "Name": "NumThreads", - "Type": "Int", - "Desc": "The number of threads to use", + "Name": "CustomGains", + "Type": "String", + "Desc": "Comma seperated list of gains associated to each relevance label.", "Aliases": [ - "t" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "RngSeed", - "Type": "Int", - "Desc": "The seed of the random number generator", - "Aliases": [ - "r1" + "gains" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 123 + "Default": "0,3,7,15,31" }, { - "Name": "FeatureSelectSeed", - "Type": "Int", - "Desc": "The seed of the active feature selection", + "Name": "TrainDcg", + "Type": "Bool", + "Desc": "Train DCG instead of NDCG", "Aliases": [ - "r3" + "dcg" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 123 + "Default": false }, { - 
"Name": "EntropyCoefficient", - "Type": "Float", - "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Name": "SortingAlgorithm", + "Type": "String", + "Desc": "The sorting algorithm to use for DCG and LambdaMart calculations [DescendingStablePessimistic/DescendingStable/DescendingReverse/DescendingDotNet]", "Aliases": [ - "e" + "sort" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": "DescendingStablePessimistic" }, { - "Name": "HistogramPoolSize", + "Name": "LambdaMartMaxTruncation", "Type": "Int", - "Desc": "The number of histograms in the pool (between 2 and numLeaves)", + "Desc": "max-NDCG truncation to use in the Lambda Mart algorithm", "Aliases": [ - "ps" + "n" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": -1 + "Default": 100 }, { - "Name": "DiskTranspose", + "Name": "ShiftedNdcg", "Type": "Bool", - "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", - "Aliases": [ - "dt" - ], + "Desc": "Use shifted NDCG", "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": false }, { - "Name": "FeatureFlocks", - "Type": "Bool", - "Desc": "Whether to collectivize features during dataset preparation to speed up training", + "Name": "CostFunctionParam", + "Type": "Char", + "Desc": "Cost function parameter (w/c)", "Aliases": [ - "flocks" + "cf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": "w" }, { - "Name": "CategoricalSplit", + "Name": "DistanceWeight2", "Type": "Bool", - "Desc": "Whether to do split based on multiple categorical feature values.", + "Desc": "Distance weight 2 adjustment to cost", "Aliases": [ - "cat" + "dw" ], "Required": false, "SortOrder": 150.0, @@ -6549,59 +6920,59 @@ "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", - "Type": "Int", - "Desc": "Maximum 
categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", + "Name": "NormalizeQueryLambdas", + "Type": "Bool", + "Desc": "Normalize query lambdas", "Aliases": [ - "mcg" + "nql" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 64 + "Default": false }, { - "Name": "MaxCategoricalSplitPoints", - "Type": "Int", - "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", + "Name": "BestStepRankingRegressionTrees", + "Type": "Bool", + "Desc": "Use best regression step trees?", "Aliases": [ - "maxcat" + "bsr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 64 + "Default": false }, { - "Name": "MinDocsPercentageForCategoricalSplit", - "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Name": "UseLineSearch", + "Type": "Bool", + "Desc": "Should we use line search for a step size", "Aliases": [ - "mdop" + "ls" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.001 + "Default": false }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "NumPostBracketSteps", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Number of post-bracket line search steps", "Aliases": [ - "mdo" + "lssteps" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 100 + "Default": 0 }, { - "Name": "Bias", + "Name": "MinStepSize", "Type": "Float", - "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", + "Desc": "Minimum line search step size", "Aliases": [ - "bias" + "minstep" ], "Required": false, "SortOrder": 150.0, @@ -6609,566 +6980,505 @@ "Default": 0.0 }, { - "Name": "Bundling", + "Name": "OptimizationAlgorithm", "Type": { "Kind": "Enum", "Values": [ - "None", - 
"AggregateLowPopulation", - "Adjacent" + "GradientDescent", + "AcceleratedGradientDescent", + "ConjugateGradientDescent" ] }, - "Desc": "Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", + "Desc": "Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent)", "Aliases": [ - "bundle" + "oa" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "None" + "Default": "GradientDescent" }, { - "Name": "MaxBins", - "Type": "Int", - "Desc": "Maximum number of distinct values (bins) per feature", + "Name": "EarlyStoppingRule", + "Type": { + "Kind": "Component", + "ComponentKind": "EarlyStoppingCriterion" + }, + "Desc": "Early stopping rule. (Validation set (/valid) is required.)", "Aliases": [ - "mb" + "esr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 255 + "Default": null }, { - "Name": "SparsifyThreshold", - "Type": "Float", - "Desc": "Sparsity level needed to use sparse feature representation", + "Name": "EarlyStoppingMetrics", + "Type": "Int", + "Desc": "Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3)", "Aliases": [ - "sp" + "esmt" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.7 + "Default": 1 }, { - "Name": "FeatureFirstUsePenalty", - "Type": "Float", - "Desc": "The feature first use penalty coefficient", + "Name": "EnablePruning", + "Type": "Bool", + "Desc": "Enable post-training pruning to avoid overfitting. 
(a validation set is required)", "Aliases": [ - "ffup" + "pruning" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": false }, { - "Name": "FeatureReusePenalty", - "Type": "Float", - "Desc": "The feature re-use penalty (regularization) coefficient", + "Name": "UseTolerantPruning", + "Type": "Bool", + "Desc": "Use window and tolerance for pruning", "Aliases": [ - "frup" + "prtol" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": false }, { - "Name": "GainConfidenceLevel", + "Name": "PruningThreshold", "Type": "Float", - "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", + "Desc": "The tolerance threshold for pruning", "Aliases": [ - "gainconf" + "prth" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 0.004 }, { - "Name": "SoftmaxTemperature", - "Type": "Float", - "Desc": "The temperature of the randomized softmax distribution for choosing the feature", + "Name": "PruningWindowSize", + "Type": "Int", + "Desc": "The moving window size for pruning", "Aliases": [ - "smtemp" + "prws" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 5 }, { - "Name": "ExecutionTimes", - "Type": "Bool", - "Desc": "Print execution time breakdown to stdout", + "Name": "Shrinkage", + "Type": "Float", + "Desc": "Shrinkage", "Aliases": [ - "et" + "shrk" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 1.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.025, + "Max": 4.0, + "IsLogScale": true + } }, { - "Name": "FeatureFraction", + "Name": "DropoutRate", "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each iteration", + "Desc": "Dropout rate for tree regularization", "Aliases": [ - "ff" + "tdrop" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0 + "Default": 0.0, + "SweepRange": { 
+ "RangeType": "Discrete", + "Values": [ + 0.0, + 1E-09, + 0.05, + 0.1, + 0.2 + ] + } }, { - "Name": "BaggingSize", + "Name": "GetDerivativesSampleRate", "Type": "Int", - "Desc": "Number of trees in each bag (0 for disabling bagging)", + "Desc": "Sample each query 1 in k times in the GetDerivatives function", "Aliases": [ - "bag" + "sr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1 }, { - "Name": "BaggingTrainFraction", - "Type": "Float", - "Desc": "Percentage of training examples used in each bag", + "Name": "WriteLastEnsemble", + "Type": "Bool", + "Desc": "Write the last ensemble instead of the one determined by early stopping", "Aliases": [ - "bagfrac" + "hl" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.7 + "Default": false }, { - "Name": "SplitFraction", + "Name": "MaxTreeOutput", "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each split", + "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ - "sf" + "mo" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0 + "Default": 100.0 }, { - "Name": "Smoothing", - "Type": "Float", - "Desc": "Smoothing paramter for tree regularization", + "Name": "RandomStart", + "Type": "Bool", + "Desc": "Training starts from random ordering (determined by /r1)", "Aliases": [ - "s" + "rs" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": false }, { - "Name": "AllowEmptyTrees", + "Name": "FilterZeroLambdas", "Type": "Bool", - "Desc": "When a root split is impossible, allow training to proceed", + "Desc": "Filter zero lambdas during training", "Aliases": [ - "allowempty", - "dummies" + "fzl" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": false }, { - "Name": "FeatureCompressionLevel", - "Type": "Int", - "Desc": "The level of feature compression to use", + "Name": "BaselineScoresFormula", 
+ "Type": "String", + "Desc": "Freeform defining the scores that should be used as the baseline ranker", "Aliases": [ - "fcomp" + "basescores" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": null }, { - "Name": "CompressEnsemble", - "Type": "Bool", - "Desc": "Compress the tree Ensemble", + "Name": "BaselineAlphaRisk", + "Type": "String", + "Desc": "Baseline alpha for tradeoffs of risk (0 is normal training)", "Aliases": [ - "cmp" + "basealpha" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": null }, { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", + "Name": "PositionDiscountFreeform", + "Type": "String", + "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ - "cmpmax" + "pdff" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": -1 + "Default": null }, { - "Name": "PrintTestGraph", - "Type": "Bool", - "Desc": "Print metrics graph for the first test set", + "Name": "ParallelTrainer", + "Type": { + "Kind": "Component", + "ComponentKind": "ParallelTraining" + }, + "Desc": "Allows to choose Parallel FastTree Learning Algorithm", "Aliases": [ - "graph" + "parag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": { + "Name": "Single" + } }, { - "Name": "PrintTrainValidGraph", - "Type": "Bool", - "Desc": "Print Train and Validation metrics in graph", + "Name": "NumThreads", + "Type": "Int", + "Desc": "The number of threads to use", "Aliases": [ - "graphtv" + "t" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": false + "IsNullable": true, + "Default": null }, { - "Name": "TestFrequency", + "Name": "RngSeed", "Type": "Int", - "Desc": "Calculate metric values for train/valid/test every k rounds", + 
"Desc": "The seed of the random number generator", "Aliases": [ - "tf" + "r1" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 2147483647 - } - ], - "Outputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The trained model" - } - ], - "InputKind": [ - "ITrainerInputWithGroupId", - "ITrainerInputWithWeight", - "ITrainerInputWithLabel", - "ITrainerInput" - ], - "OutputKind": [ - "IRankingOutput", - "ITrainerOutput" - ] - }, - { - "Name": "Trainers.FastTreeRegressor", - "Desc": "Trains gradient boosted decision trees to fit target values using least-squares.", - "FriendlyName": "FastTree (Boosted Trees) Regression", - "ShortName": "ftr", - "Inputs": [ + "Default": 123 + }, { - "Name": "NumTrees", + "Name": "FeatureSelectSeed", "Type": "Int", - "Desc": "Total number of decision trees to create in the ensemble", + "Desc": "The seed of the active feature selection", "Aliases": [ - "iter" + "r3" ], "Required": false, - "SortOrder": 1.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 100, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 20, - 100, - 500 - ] - } + "Default": 123 }, { - "Name": "TrainingData", - "Type": "DataView", - "Desc": "The data to be used for training", + "Name": "EntropyCoefficient", + "Type": "Float", + "Desc": "The entropy (regularization) coefficient between 0 and 1", "Aliases": [ - "data" + "e" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 }, { - "Name": "NumLeaves", + "Name": "HistogramPoolSize", "Type": "Int", - "Desc": "The max number of leaves in each regression tree", + "Desc": "The number of histograms in the pool (between 2 and numLeaves)", "Aliases": [ - "nl" + "ps" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 20, - "SweepRange": { - "RangeType": "Long", - "Min": 2, - "Max": 128, - "StepSize": 4.0, - 
"IsLogScale": true - } + "Default": -1 }, { - "Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", + "Name": "DiskTranspose", + "Type": "Bool", + "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", "Aliases": [ - "feat" + "dt" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "FeatureFlocks", + "Type": "Bool", + "Desc": "Whether to collectivize features during dataset preparation to speed up training", + "Aliases": [ + "flocks" + ], + "Required": false, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Features" + "Default": true }, { - "Name": "MinDocumentsInLeafs", - "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Name": "CategoricalSplit", + "Type": "Bool", + "Desc": "Whether to do split based on multiple categorical feature values.", "Aliases": [ - "mil" + "cat" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 10, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 1, - 10, - 50 - ] - } + "Default": false }, { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Column to use for labels", + "Name": "MaxCategoricalGroupsPerNode", + "Type": "Int", + "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ - "lab" + "mcg" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Label" + "Default": 64 }, { - "Name": "LearningRates", - "Type": "Float", - "Desc": "The learning rate", + "Name": "MaxCategoricalSplitPoints", + "Type": "Int", + "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ - "lr" + "maxcat" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 0.2, - "SweepRange": { - "RangeType": "Float", - "Min": 0.025, - "Max": 0.4, - "IsLogScale": true - } + "Default": 64 }, { - "Name": "WeightColumn", - "Type": "String", - "Desc": "Column to use for example weight", + "Name": "MinDocsPercentageForCategoricalSplit", + "Type": "Float", + "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", "Aliases": [ - "weight" + "mdop" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Weight" + "Default": 0.001 }, { - "Name": "GroupIdColumn", - "Type": "String", - "Desc": "Column to use for example groupId", + "Name": "MinDocsForCategoricalSplit", + "Type": "Int", + "Desc": "Minimum categorical doc count in a bin to consider for a split.", "Aliases": [ - "groupId" + "mdo" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "GroupId" + "Default": 100 }, { - "Name": "NormalizeFeatures", - "Type": { - "Kind": "Enum", - "Values": [ - "No", - "Warn", - "Auto", - "Yes" - ] - }, - "Desc": "Normalize option for the feature column", + "Name": "Bias", + "Type": "Float", + "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", "Aliases": [ - "norm" + "bias" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": 0.0 }, { - "Name": "Caching", + 
"Name": "Bundling", "Type": { "Kind": "Enum", "Values": [ - "Auto", - "Memory", - "Disk", - "None" + "None", + "AggregateLowPopulation", + "Adjacent" ] }, - "Desc": "Whether learner should cache input training data", - "Aliases": [ - "cache" - ], - "Required": false, - "SortOrder": 6.0, - "IsNullable": false, - "Default": "Auto" - }, - { - "Name": "BestStepRankingRegressionTrees", - "Type": "Bool", - "Desc": "Use best regression step trees?", - "Aliases": [ - "bsr" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "UseLineSearch", - "Type": "Bool", - "Desc": "Should we use line search for a step size", + "Desc": "Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", "Aliases": [ - "ls" + "bundle" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": "None" }, { - "Name": "NumPostBracketSteps", + "Name": "MaxBins", "Type": "Int", - "Desc": "Number of post-bracket line search steps", + "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ - "lssteps" + "mb" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 255 }, { - "Name": "MinStepSize", + "Name": "SparsifyThreshold", "Type": "Float", - "Desc": "Minimum line search step size", + "Desc": "Sparsity level needed to use sparse feature representation", "Aliases": [ - "minstep" + "sp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 0.7 }, { - "Name": "OptimizationAlgorithm", - "Type": { - "Kind": "Enum", - "Values": [ - "GradientDescent", - "AcceleratedGradientDescent", - "ConjugateGradientDescent" - ] - }, - "Desc": "Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent)", + "Name": "FeatureFirstUsePenalty", + "Type": "Float", + "Desc": "The feature first use penalty 
coefficient", "Aliases": [ - "oa" + "ffup" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "GradientDescent" + "Default": 0.0 }, { - "Name": "EarlyStoppingRule", - "Type": { - "Kind": "Component", - "ComponentKind": "EarlyStoppingCriterion" - }, - "Desc": "Early stopping rule. (Validation set (/valid) is required.)", + "Name": "FeatureReusePenalty", + "Type": "Float", + "Desc": "The feature re-use penalty (regularization) coefficient", "Aliases": [ - "esr" + "frup" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": 0.0 }, { - "Name": "EarlyStoppingMetrics", - "Type": "Int", - "Desc": "Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3)", + "Name": "GainConfidenceLevel", + "Type": "Float", + "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", "Aliases": [ - "esmt" + "gainconf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": 0.0 }, { - "Name": "EnablePruning", - "Type": "Bool", - "Desc": "Enable post-training pruning to avoid overfitting. 
(a validation set is required)", + "Name": "SoftmaxTemperature", + "Type": "Float", + "Desc": "The temperature of the randomized softmax distribution for choosing the feature", "Aliases": [ - "pruning" + "smtemp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.0 }, { - "Name": "UseTolerantPruning", + "Name": "ExecutionTimes", "Type": "Bool", - "Desc": "Use window and tolerance for pruning", + "Desc": "Print execution time breakdown to stdout", "Aliases": [ - "prtol" + "et" ], "Required": false, "SortOrder": 150.0, @@ -7176,111 +7486,96 @@ "Default": false }, { - "Name": "PruningThreshold", + "Name": "FeatureFraction", "Type": "Float", - "Desc": "The tolerance threshold for pruning", + "Desc": "The fraction of features (chosen randomly) to use on each iteration", "Aliases": [ - "prth" + "ff" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.004 + "Default": 1.0 }, { - "Name": "PruningWindowSize", + "Name": "BaggingSize", "Type": "Int", - "Desc": "The moving window size for pruning", + "Desc": "Number of trees in each bag (0 for disabling bagging)", "Aliases": [ - "prws" + "bag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 5 + "Default": 0 }, { - "Name": "Shrinkage", + "Name": "BaggingTrainFraction", "Type": "Float", - "Desc": "Shrinkage", + "Desc": "Percentage of training examples used in each bag", "Aliases": [ - "shrk" + "bagfrac" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.025, - "Max": 4.0, - "IsLogScale": true - } + "Default": 0.7 }, { - "Name": "DropoutRate", + "Name": "SplitFraction", "Type": "Float", - "Desc": "Dropout rate for tree regularization", + "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ - "tdrop" + "sf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0, - "SweepRange": { - 
"RangeType": "Discrete", - "Values": [ - 0.0, - 1E-09, - 0.05, - 0.1, - 0.2 - ] - } + "Default": 1.0 }, { - "Name": "GetDerivativesSampleRate", - "Type": "Int", - "Desc": "Sample each query 1 in k times in the GetDerivatives function", + "Name": "Smoothing", + "Type": "Float", + "Desc": "Smoothing paramter for tree regularization", "Aliases": [ - "sr" + "s" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": 0.0 }, { - "Name": "WriteLastEnsemble", + "Name": "AllowEmptyTrees", "Type": "Bool", - "Desc": "Write the last ensemble instead of the one determined by early stopping", + "Desc": "When a root split is impossible, allow training to proceed", "Aliases": [ - "hl" + "allowempty", + "dummies" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": true }, { - "Name": "MaxTreeOutput", - "Type": "Float", - "Desc": "Upper bound on absolute value of single tree output", + "Name": "FeatureCompressionLevel", + "Type": "Int", + "Desc": "The level of feature compression to use", "Aliases": [ - "mo" + "fcomp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 100.0 + "Default": 1 }, { - "Name": "RandomStart", + "Name": "CompressEnsemble", "Type": "Bool", - "Desc": "Training starts from random ordering (determined by /r1)", + "Desc": "Compress the tree Ensemble", "Aliases": [ - "rs" + "cmp" ], "Required": false, "SortOrder": 150.0, @@ -7288,220 +7583,296 @@ "Default": false }, { - "Name": "FilterZeroLambdas", - "Type": "Bool", - "Desc": "Filter zero lambdas during training", + "Name": "MaxTreesAfterCompression", + "Type": "Int", + "Desc": "Maximum Number of trees after compression", "Aliases": [ - "fzl" + "cmpmax" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": -1 }, { - "Name": "BaselineScoresFormula", - "Type": "String", - "Desc": "Freeform defining the scores that should be used as the baseline ranker", + "Name": 
"PrintTestGraph", + "Type": "Bool", + "Desc": "Print metrics graph for the first test set", "Aliases": [ - "basescores" + "graph" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": false }, { - "Name": "BaselineAlphaRisk", - "Type": "String", - "Desc": "Baseline alpha for tradeoffs of risk (0 is normal training)", + "Name": "PrintTrainValidGraph", + "Type": "Bool", + "Desc": "Print Train and Validation metrics in graph", "Aliases": [ - "basealpha" + "graphtv" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": false }, { - "Name": "PositionDiscountFreeform", - "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Name": "TestFrequency", + "Type": "Int", + "Desc": "Calculate metric values for train/valid/test every k rounds", "Aliases": [ - "pdff" + "tf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null - }, + "Default": 2147483647 + } + ], + "Outputs": [ { - "Name": "ParallelTrainer", - "Type": { - "Kind": "Component", - "ComponentKind": "ParallelTraining" - }, - "Desc": "Allows to choose Parallel FastTree Learning Algorithm", + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithGroupId", + "ITrainerInputWithWeight", + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IRankingOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.FastTreeRegressor", + "Desc": "Trains gradient boosted decision trees to fit target values using least-squares.", + "FriendlyName": "FastTree (Boosted Trees) Regression", + "ShortName": "ftr", + "Inputs": [ + { + "Name": "NumTrees", + "Type": "Int", + "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ - "parag" + "iter" ], "Required": false, - "SortOrder": 
150.0, + "SortOrder": 1.0, "IsNullable": false, - "Default": { - "Name": "Single" + "Default": 100, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 20, + 100, + 500 + ] } }, { - "Name": "NumThreads", + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "NumLeaves", "Type": "Int", - "Desc": "The number of threads to use", + "Desc": "The max number of leaves in each regression tree", "Aliases": [ - "t" + "nl" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "SortOrder": 2.0, + "IsNullable": false, + "Default": 20, + "SweepRange": { + "RangeType": "Long", + "Min": 2, + "Max": 128, + "StepSize": 4.0, + "IsLogScale": true + } }, { - "Name": "RngSeed", - "Type": "Int", - "Desc": "The seed of the random number generator", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", "Aliases": [ - "r1" + "feat" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": 123 + "Default": "Features" }, { - "Name": "FeatureSelectSeed", + "Name": "MinDocumentsInLeafs", "Type": "Int", - "Desc": "The seed of the active feature selection", + "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ - "r3" + "mil" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 123 + "Default": 10, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 10, + 50 + ] + } }, { - "Name": "EntropyCoefficient", - "Type": "Float", - "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", "Aliases": [ - "e" + "lab" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 0.0 + "Default": 
"Label" }, { - "Name": "HistogramPoolSize", - "Type": "Int", - "Desc": "The number of histograms in the pool (between 2 and numLeaves)", + "Name": "LearningRates", + "Type": "Float", + "Desc": "The learning rate", "Aliases": [ - "ps" + "lr" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": -1 + "Default": 0.2, + "SweepRange": { + "RangeType": "Float", + "Min": 0.025, + "Max": 0.4, + "IsLogScale": true + } }, { - "Name": "DiskTranspose", - "Type": "Bool", - "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", + "Name": "WeightColumn", + "Type": "String", + "Desc": "Column to use for example weight", "Aliases": [ - "dt" + "weight" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "SortOrder": 4.0, + "IsNullable": false, + "Default": "Weight" }, { - "Name": "FeatureFlocks", - "Type": "Bool", - "Desc": "Whether to collectivize features during dataset preparation to speed up training", + "Name": "GroupIdColumn", + "Type": "String", + "Desc": "Column to use for example groupId", "Aliases": [ - "flocks" + "groupId" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": true + "Default": "GroupId" }, { - "Name": "CategoricalSplit", - "Type": "Bool", - "Desc": "Whether to do split based on multiple categorical feature values.", + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", "Aliases": [ - "cat" + "norm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": false + "Default": "Auto" }, { - "Name": "MaxCategoricalGroupsPerNode", - "Type": "Int", - "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", "Aliases": [ - "mcg" + "cache" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": 64 + "Default": "Auto" }, { - "Name": "MaxCategoricalSplitPoints", - "Type": "Int", - "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", + "Name": "BestStepRankingRegressionTrees", + "Type": "Bool", + "Desc": "Use best regression step trees?", "Aliases": [ - "maxcat" + "bsr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 64 + "Default": false }, { - "Name": "MinDocsPercentageForCategoricalSplit", - "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Name": "UseLineSearch", + "Type": "Bool", + "Desc": "Should we use line search for a step size", "Aliases": [ - "mdop" + "ls" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.001 + "Default": false }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "NumPostBracketSteps", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Number of post-bracket line search steps", "Aliases": [ - "mdo" + "lssteps" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 100 + "Default": 0 }, { - "Name": "Bias", + "Name": "MinStepSize", "Type": "Float", - "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", + "Desc": "Minimum line search step size", "Aliases": [ - "bias" + "minstep" ], "Required": false, "SortOrder": 150.0, @@ -7509,505 +7880,493 @@ "Default": 0.0 }, { - "Name": "Bundling", + "Name": "OptimizationAlgorithm", "Type": { "Kind": "Enum", "Values": [ - "None", - "AggregateLowPopulation", - 
"Adjacent" + "GradientDescent", + "AcceleratedGradientDescent", + "ConjugateGradientDescent" ] }, - "Desc": "Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", + "Desc": "Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent)", "Aliases": [ - "bundle" + "oa" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "None" + "Default": "GradientDescent" }, { - "Name": "MaxBins", - "Type": "Int", - "Desc": "Maximum number of distinct values (bins) per feature", - "Aliases": [ - "mb" - ], - "Required": false, + "Name": "EarlyStoppingRule", + "Type": { + "Kind": "Component", + "ComponentKind": "EarlyStoppingCriterion" + }, + "Desc": "Early stopping rule. (Validation set (/valid) is required.)", + "Aliases": [ + "esr" + ], + "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 255 + "Default": null }, { - "Name": "SparsifyThreshold", - "Type": "Float", - "Desc": "Sparsity level needed to use sparse feature representation", + "Name": "EarlyStoppingMetrics", + "Type": "Int", + "Desc": "Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3)", "Aliases": [ - "sp" + "esmt" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.7 + "Default": 1 }, { - "Name": "FeatureFirstUsePenalty", - "Type": "Float", - "Desc": "The feature first use penalty coefficient", + "Name": "EnablePruning", + "Type": "Bool", + "Desc": "Enable post-training pruning to avoid overfitting. 
(a validation set is required)", "Aliases": [ - "ffup" + "pruning" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": false }, { - "Name": "FeatureReusePenalty", - "Type": "Float", - "Desc": "The feature re-use penalty (regularization) coefficient", + "Name": "UseTolerantPruning", + "Type": "Bool", + "Desc": "Use window and tolerance for pruning", "Aliases": [ - "frup" + "prtol" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": false }, { - "Name": "GainConfidenceLevel", + "Name": "PruningThreshold", "Type": "Float", - "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", + "Desc": "The tolerance threshold for pruning", "Aliases": [ - "gainconf" + "prth" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 0.004 }, { - "Name": "SoftmaxTemperature", - "Type": "Float", - "Desc": "The temperature of the randomized softmax distribution for choosing the feature", + "Name": "PruningWindowSize", + "Type": "Int", + "Desc": "The moving window size for pruning", "Aliases": [ - "smtemp" + "prws" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 5 }, { - "Name": "ExecutionTimes", - "Type": "Bool", - "Desc": "Print execution time breakdown to stdout", + "Name": "Shrinkage", + "Type": "Float", + "Desc": "Shrinkage", "Aliases": [ - "et" + "shrk" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 1.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.025, + "Max": 4.0, + "IsLogScale": true + } }, { - "Name": "FeatureFraction", + "Name": "DropoutRate", "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each iteration", + "Desc": "Dropout rate for tree regularization", "Aliases": [ - "ff" + "tdrop" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0 + "Default": 0.0, + "SweepRange": { 
+ "RangeType": "Discrete", + "Values": [ + 0.0, + 1E-09, + 0.05, + 0.1, + 0.2 + ] + } }, { - "Name": "BaggingSize", + "Name": "GetDerivativesSampleRate", "Type": "Int", - "Desc": "Number of trees in each bag (0 for disabling bagging)", + "Desc": "Sample each query 1 in k times in the GetDerivatives function", "Aliases": [ - "bag" + "sr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1 }, { - "Name": "BaggingTrainFraction", - "Type": "Float", - "Desc": "Percentage of training examples used in each bag", + "Name": "WriteLastEnsemble", + "Type": "Bool", + "Desc": "Write the last ensemble instead of the one determined by early stopping", "Aliases": [ - "bagfrac" + "hl" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.7 + "Default": false }, { - "Name": "SplitFraction", + "Name": "MaxTreeOutput", "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each split", + "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ - "sf" + "mo" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0 + "Default": 100.0 }, { - "Name": "Smoothing", - "Type": "Float", - "Desc": "Smoothing paramter for tree regularization", + "Name": "RandomStart", + "Type": "Bool", + "Desc": "Training starts from random ordering (determined by /r1)", "Aliases": [ - "s" + "rs" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": false }, { - "Name": "AllowEmptyTrees", + "Name": "FilterZeroLambdas", "Type": "Bool", - "Desc": "When a root split is impossible, allow training to proceed", + "Desc": "Filter zero lambdas during training", "Aliases": [ - "allowempty", - "dummies" + "fzl" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": false }, { - "Name": "FeatureCompressionLevel", - "Type": "Int", - "Desc": "The level of feature compression to use", + "Name": "BaselineScoresFormula", 
+ "Type": "String", + "Desc": "Freeform defining the scores that should be used as the baseline ranker", "Aliases": [ - "fcomp" + "basescores" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": null }, { - "Name": "CompressEnsemble", - "Type": "Bool", - "Desc": "Compress the tree Ensemble", + "Name": "BaselineAlphaRisk", + "Type": "String", + "Desc": "Baseline alpha for tradeoffs of risk (0 is normal training)", "Aliases": [ - "cmp" + "basealpha" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": null }, { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", + "Name": "PositionDiscountFreeform", + "Type": "String", + "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ - "cmpmax" + "pdff" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": -1 + "Default": null }, { - "Name": "PrintTestGraph", - "Type": "Bool", - "Desc": "Print metrics graph for the first test set", + "Name": "ParallelTrainer", + "Type": { + "Kind": "Component", + "ComponentKind": "ParallelTraining" + }, + "Desc": "Allows to choose Parallel FastTree Learning Algorithm", "Aliases": [ - "graph" + "parag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": { + "Name": "Single" + } }, { - "Name": "PrintTrainValidGraph", - "Type": "Bool", - "Desc": "Print Train and Validation metrics in graph", + "Name": "NumThreads", + "Type": "Int", + "Desc": "The number of threads to use", "Aliases": [ - "graphtv" + "t" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": false + "IsNullable": true, + "Default": null }, { - "Name": "TestFrequency", + "Name": "RngSeed", "Type": "Int", - "Desc": "Calculate metric values for train/valid/test every k rounds", + 
"Desc": "The seed of the random number generator", "Aliases": [ - "tf" + "r1" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 2147483647 - } - ], - "Outputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The trained model" - } - ], - "InputKind": [ - "ITrainerInputWithGroupId", - "ITrainerInputWithWeight", - "ITrainerInputWithLabel", - "ITrainerInput" - ], - "OutputKind": [ - "IRegressionOutput", - "ITrainerOutput" - ] - }, - { - "Name": "Trainers.FastTreeTweedieRegressor", - "Desc": "Trains gradient boosted decision trees to fit target values using a Tweedie loss function. This learner is a generalization of Poisson, compound Poisson, and gamma regression.", - "FriendlyName": "FastTree (Boosted Trees) Tweedie Regression", - "ShortName": "fttweedie", - "Inputs": [ + "Default": 123 + }, { - "Name": "NumTrees", + "Name": "FeatureSelectSeed", "Type": "Int", - "Desc": "Total number of decision trees to create in the ensemble", + "Desc": "The seed of the active feature selection", "Aliases": [ - "iter" + "r3" ], "Required": false, - "SortOrder": 1.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 100, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 20, - 100, - 500 - ] - } + "Default": 123 }, { - "Name": "TrainingData", - "Type": "DataView", - "Desc": "The data to be used for training", + "Name": "EntropyCoefficient", + "Type": "Float", + "Desc": "The entropy (regularization) coefficient between 0 and 1", "Aliases": [ - "data" + "e" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 }, { - "Name": "NumLeaves", + "Name": "HistogramPoolSize", "Type": "Int", - "Desc": "The max number of leaves in each regression tree", + "Desc": "The number of histograms in the pool (between 2 and numLeaves)", "Aliases": [ - "nl" + "ps" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": 
false, - "Default": 20, - "SweepRange": { - "RangeType": "Long", - "Min": 2, - "Max": 128, - "StepSize": 4.0, - "IsLogScale": true - } + "Default": -1 }, { - "Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", + "Name": "DiskTranspose", + "Type": "Bool", + "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", "Aliases": [ - "feat" + "dt" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "FeatureFlocks", + "Type": "Bool", + "Desc": "Whether to collectivize features during dataset preparation to speed up training", + "Aliases": [ + "flocks" + ], + "Required": false, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Features" + "Default": true }, { - "Name": "MinDocumentsInLeafs", + "Name": "CategoricalSplit", + "Type": "Bool", + "Desc": "Whether to do split based on multiple categorical feature values.", + "Aliases": [ + "cat" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "MaxCategoricalGroupsPerNode", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ - "mil" + "mcg" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 10, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 1, - 10, - 50 - ] - } + "Default": 64 }, { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Column to use for labels", + "Name": "MaxCategoricalSplitPoints", + "Type": "Int", + "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ - "lab" + "maxcat" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Label" + "Default": 64 }, { - "Name": "LearningRates", + "Name": "MinDocsPercentageForCategoricalSplit", "Type": "Float", - "Desc": "The learning rate", + "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", "Aliases": [ - "lr" + "mdop" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 0.2, - "SweepRange": { - "RangeType": "Float", - "Min": 0.025, - "Max": 0.4, - "IsLogScale": true - } + "Default": 0.001 }, { - "Name": "WeightColumn", - "Type": "String", - "Desc": "Column to use for example weight", + "Name": "MinDocsForCategoricalSplit", + "Type": "Int", + "Desc": "Minimum categorical doc count in a bin to consider for a split.", "Aliases": [ - "weight" + "mdo" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Weight" + "Default": 100 }, { - "Name": "GroupIdColumn", - "Type": "String", - "Desc": "Column to use for example groupId", + "Name": "Bias", + "Type": "Float", + "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", "Aliases": [ - "groupId" + "bias" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "GroupId" + "Default": 0.0 }, { - "Name": "NormalizeFeatures", + "Name": "Bundling", 
"Type": { "Kind": "Enum", "Values": [ - "No", - "Warn", - "Auto", - "Yes" + "None", + "AggregateLowPopulation", + "Adjacent" ] }, - "Desc": "Normalize option for the feature column", + "Desc": "Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", "Aliases": [ - "norm" + "bundle" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": "None" }, { - "Name": "Caching", - "Type": { - "Kind": "Enum", - "Values": [ - "Auto", - "Memory", - "Disk", - "None" - ] - }, - "Desc": "Whether learner should cache input training data", + "Name": "MaxBins", + "Type": "Int", + "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ - "cache" + "mb" ], "Required": false, - "SortOrder": 6.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": 255 }, { - "Name": "Index", + "Name": "SparsifyThreshold", "Type": "Float", - "Desc": "Index parameter for the Tweedie distribution, in the range [1, 2]. 
1 is Poisson loss, 2 is gamma loss, and intermediate values are compound Poisson loss.", + "Desc": "Sparsity level needed to use sparse feature representation", + "Aliases": [ + "sp" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.5 + "Default": 0.7 }, { - "Name": "BestStepRankingRegressionTrees", - "Type": "Bool", - "Desc": "Use best regression step trees?", + "Name": "FeatureFirstUsePenalty", + "Type": "Float", + "Desc": "The feature first use penalty coefficient", "Aliases": [ - "bsr" + "ffup" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.0 }, { - "Name": "UseLineSearch", - "Type": "Bool", - "Desc": "Should we use line search for a step size", + "Name": "FeatureReusePenalty", + "Type": "Float", + "Desc": "The feature re-use penalty (regularization) coefficient", "Aliases": [ - "ls" + "frup" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.0 }, { - "Name": "NumPostBracketSteps", - "Type": "Int", - "Desc": "Number of post-bracket line search steps", + "Name": "GainConfidenceLevel", + "Type": "Float", + "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", "Aliases": [ - "lssteps" + "gainconf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 0.0 }, { - "Name": "MinStepSize", + "Name": "SoftmaxTemperature", "Type": "Float", - "Desc": "Minimum line search step size", + "Desc": "The temperature of the randomized softmax distribution for choosing the feature", "Aliases": [ - "minstep" + "smtemp" ], "Required": false, "SortOrder": 150.0, @@ -8015,45 +8374,35 @@ "Default": 0.0 }, { - "Name": "OptimizationAlgorithm", - "Type": { - "Kind": "Enum", - "Values": [ - "GradientDescent", - "AcceleratedGradientDescent", - "ConjugateGradientDescent" - ] - }, - "Desc": "Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent)", + "Name": "ExecutionTimes", + 
"Type": "Bool", + "Desc": "Print execution time breakdown to stdout", "Aliases": [ - "oa" + "et" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "GradientDescent" + "Default": false }, { - "Name": "EarlyStoppingRule", - "Type": { - "Kind": "Component", - "ComponentKind": "EarlyStoppingCriterion" - }, - "Desc": "Early stopping rule. (Validation set (/valid) is required.)", + "Name": "FeatureFraction", + "Type": "Float", + "Desc": "The fraction of features (chosen randomly) to use on each iteration", "Aliases": [ - "esr" + "ff" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": 1.0 }, { - "Name": "EarlyStoppingMetrics", + "Name": "BaggingSize", "Type": "Int", - "Desc": "Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3)", + "Desc": "Number of trees in each bag (0 for disabling bagging)", "Aliases": [ - "esmt" + "bag" ], "Required": false, "SortOrder": 150.0, @@ -8061,99 +8410,60 @@ "Default": 0 }, { - "Name": "EnablePruning", - "Type": "Bool", - "Desc": "Enable post-training pruning to avoid overfitting. 
(a validation set is required)", + "Name": "BaggingTrainFraction", + "Type": "Float", + "Desc": "Percentage of training examples used in each bag", "Aliases": [ - "pruning" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "UseTolerantPruning", - "Type": "Bool", - "Desc": "Use window and tolerance for pruning", - "Aliases": [ - "prtol" + "bagfrac" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.7 }, { - "Name": "PruningThreshold", + "Name": "SplitFraction", "Type": "Float", - "Desc": "The tolerance threshold for pruning", - "Aliases": [ - "prth" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.004 - }, - { - "Name": "PruningWindowSize", - "Type": "Int", - "Desc": "The moving window size for pruning", + "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ - "prws" + "sf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 5 + "Default": 1.0 }, { - "Name": "Shrinkage", + "Name": "Smoothing", "Type": "Float", - "Desc": "Shrinkage", + "Desc": "Smoothing paramter for tree regularization", "Aliases": [ - "shrk" + "s" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.025, - "Max": 4.0, - "IsLogScale": true - } + "Default": 0.0 }, { - "Name": "DropoutRate", - "Type": "Float", - "Desc": "Dropout rate for tree regularization", + "Name": "AllowEmptyTrees", + "Type": "Bool", + "Desc": "When a root split is impossible, allow training to proceed", "Aliases": [ - "tdrop" + "allowempty", + "dummies" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.0, - 1E-09, - 0.05, - 0.1, - 0.2 - ] - } + "Default": true }, { - "Name": "GetDerivativesSampleRate", + "Name": "FeatureCompressionLevel", "Type": "Int", - 
"Desc": "Sample each query 1 in k times in the GetDerivatives function", + "Desc": "The level of feature compression to use", "Aliases": [ - "sr" + "fcomp" ], "Required": false, "SortOrder": 150.0, @@ -8161,11 +8471,11 @@ "Default": 1 }, { - "Name": "WriteLastEnsemble", + "Name": "CompressEnsemble", "Type": "Bool", - "Desc": "Write the last ensemble instead of the one determined by early stopping", + "Desc": "Compress the tree Ensemble", "Aliases": [ - "hl" + "cmp" ], "Required": false, "SortOrder": 150.0, @@ -8173,23 +8483,23 @@ "Default": false }, { - "Name": "MaxTreeOutput", - "Type": "Float", - "Desc": "Upper bound on absolute value of single tree output", + "Name": "MaxTreesAfterCompression", + "Type": "Int", + "Desc": "Maximum Number of trees after compression", "Aliases": [ - "mo" + "cmpmax" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 100.0 + "Default": -1 }, { - "Name": "RandomStart", + "Name": "PrintTestGraph", "Type": "Bool", - "Desc": "Training starts from random ordering (determined by /r1)", + "Desc": "Print metrics graph for the first test set", "Aliases": [ - "rs" + "graph" ], "Required": false, "SortOrder": 150.0, @@ -8197,11 +8507,11 @@ "Default": false }, { - "Name": "FilterZeroLambdas", + "Name": "PrintTrainValidGraph", "Type": "Bool", - "Desc": "Filter zero lambdas during training", + "Desc": "Print Train and Validation metrics in graph", "Aliases": [ - "fzl" + "graphtv" ], "Required": false, "SortOrder": 150.0, @@ -8209,208 +8519,269 @@ "Default": false }, { - "Name": "BaselineScoresFormula", - "Type": "String", - "Desc": "Freeform defining the scores that should be used as the baseline ranker", + "Name": "TestFrequency", + "Type": "Int", + "Desc": "Calculate metric values for train/valid/test every k rounds", "Aliases": [ - "basescores" + "tf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null - }, + "Default": 2147483647 + } + ], + "Outputs": [ { - "Name": "BaselineAlphaRisk", - 
"Type": "String", - "Desc": "Baseline alpha for tradeoffs of risk (0 is normal training)", + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithGroupId", + "ITrainerInputWithWeight", + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IRegressionOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.FastTreeTweedieRegressor", + "Desc": "Trains gradient boosted decision trees to fit target values using a Tweedie loss function. This learner is a generalization of Poisson, compound Poisson, and gamma regression.", + "FriendlyName": "FastTree (Boosted Trees) Tweedie Regression", + "ShortName": "fttweedie", + "Inputs": [ + { + "Name": "NumTrees", + "Type": "Int", + "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ - "basealpha" + "iter" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 1.0, "IsNullable": false, - "Default": null + "Default": 100, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 20, + 100, + 500 + ] + } }, { - "Name": "PositionDiscountFreeform", - "Type": "String", - "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", "Aliases": [ - "pdff" + "data" ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "ParallelTrainer", - "Type": { - "Kind": "Component", - "ComponentKind": "ParallelTraining" - }, - "Desc": "Allows to choose Parallel FastTree Learning Algorithm", + "Name": "NumLeaves", + "Type": "Int", + "Desc": "The max number of leaves in each regression tree", "Aliases": [ - "parag" + "nl" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - 
"Default": { - "Name": "Single" + "Default": 20, + "SweepRange": { + "RangeType": "Long", + "Min": 2, + "Max": 128, + "StepSize": 4.0, + "IsLogScale": true } }, { - "Name": "NumThreads", - "Type": "Int", - "Desc": "The number of threads to use", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", "Aliases": [ - "t" + "feat" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Features" }, { - "Name": "RngSeed", + "Name": "MinDocumentsInLeafs", "Type": "Int", - "Desc": "The seed of the random number generator", + "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ - "r1" + "mil" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 123 + "Default": 10, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 10, + 50 + ] + } }, { - "Name": "FeatureSelectSeed", - "Type": "Int", - "Desc": "The seed of the active feature selection", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", "Aliases": [ - "r3" + "lab" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 123 + "Default": "Label" }, { - "Name": "EntropyCoefficient", + "Name": "LearningRates", "Type": "Float", - "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Desc": "The learning rate", "Aliases": [ - "e" + "lr" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "HistogramPoolSize", - "Type": "Int", - "Desc": "The number of histograms in the pool (between 2 and numLeaves)", + "Default": 0.2, + "SweepRange": { + "RangeType": "Float", + "Min": 0.025, + "Max": 0.4, + "IsLogScale": true + } + }, + { + "Name": "WeightColumn", + "Type": "String", + "Desc": "Column to use for example weight", "Aliases": [ - 
"ps" + "weight" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": -1 + "Default": "Weight" }, { - "Name": "DiskTranspose", - "Type": "Bool", - "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", + "Name": "GroupIdColumn", + "Type": "String", + "Desc": "Column to use for example groupId", "Aliases": [ - "dt" + "groupId" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "SortOrder": 5.0, + "IsNullable": false, + "Default": "GroupId" }, { - "Name": "FeatureFlocks", - "Type": "Bool", - "Desc": "Whether to collectivize features during dataset preparation to speed up training", + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", "Aliases": [ - "flocks" + "norm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": true + "Default": "Auto" }, { - "Name": "CategoricalSplit", - "Type": "Bool", - "Desc": "Whether to do split based on multiple categorical feature values.", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", "Aliases": [ - "cat" + "cache" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": false + "Default": "Auto" }, { - "Name": "MaxCategoricalGroupsPerNode", - "Type": "Int", - "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", - "Aliases": [ - "mcg" - ], + "Name": "Index", + "Type": "Float", + "Desc": "Index parameter for the Tweedie distribution, in the range [1, 2]. 
1 is Poisson loss, 2 is gamma loss, and intermediate values are compound Poisson loss.", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 64 + "Default": 1.5 }, { - "Name": "MaxCategoricalSplitPoints", - "Type": "Int", - "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", + "Name": "BestStepRankingRegressionTrees", + "Type": "Bool", + "Desc": "Use best regression step trees?", "Aliases": [ - "maxcat" + "bsr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 64 + "Default": false }, { - "Name": "MinDocsPercentageForCategoricalSplit", - "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", + "Name": "UseLineSearch", + "Type": "Bool", + "Desc": "Should we use line search for a step size", "Aliases": [ - "mdop" + "ls" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.001 + "Default": false }, { - "Name": "MinDocsForCategoricalSplit", + "Name": "NumPostBracketSteps", "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Desc": "Number of post-bracket line search steps", "Aliases": [ - "mdo" + "lssteps" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 100 + "Default": 0 }, { - "Name": "Bias", + "Name": "MinStepSize", "Type": "Float", - "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", + "Desc": "Minimum line search step size", "Aliases": [ - "bias" + "minstep" ], "Required": false, "SortOrder": 150.0, @@ -8418,464 +8789,469 @@ "Default": 0.0 }, { - "Name": "Bundling", + "Name": "OptimizationAlgorithm", "Type": { "Kind": "Enum", "Values": [ - "None", - "AggregateLowPopulation", - "Adjacent" + "GradientDescent", + "AcceleratedGradientDescent", + "ConjugateGradientDescent" ] }, - "Desc": "Bundle low population bins. 
Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", + "Desc": "Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent)", "Aliases": [ - "bundle" + "oa" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "None" + "Default": "GradientDescent" }, { - "Name": "MaxBins", - "Type": "Int", - "Desc": "Maximum number of distinct values (bins) per feature", + "Name": "EarlyStoppingRule", + "Type": { + "Kind": "Component", + "ComponentKind": "EarlyStoppingCriterion" + }, + "Desc": "Early stopping rule. (Validation set (/valid) is required.)", "Aliases": [ - "mb" + "esr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 255 + "Default": null }, { - "Name": "SparsifyThreshold", - "Type": "Float", - "Desc": "Sparsity level needed to use sparse feature representation", + "Name": "EarlyStoppingMetrics", + "Type": "Int", + "Desc": "Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3)", "Aliases": [ - "sp" + "esmt" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.7 + "Default": 0 }, { - "Name": "FeatureFirstUsePenalty", - "Type": "Float", - "Desc": "The feature first use penalty coefficient", + "Name": "EnablePruning", + "Type": "Bool", + "Desc": "Enable post-training pruning to avoid overfitting. 
(a validation set is required)", "Aliases": [ - "ffup" + "pruning" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": false }, { - "Name": "FeatureReusePenalty", - "Type": "Float", - "Desc": "The feature re-use penalty (regularization) coefficient", + "Name": "UseTolerantPruning", + "Type": "Bool", + "Desc": "Use window and tolerance for pruning", "Aliases": [ - "frup" + "prtol" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": false }, { - "Name": "GainConfidenceLevel", + "Name": "PruningThreshold", "Type": "Float", - "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", + "Desc": "The tolerance threshold for pruning", "Aliases": [ - "gainconf" + "prth" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 0.004 }, { - "Name": "SoftmaxTemperature", - "Type": "Float", - "Desc": "The temperature of the randomized softmax distribution for choosing the feature", + "Name": "PruningWindowSize", + "Type": "Int", + "Desc": "The moving window size for pruning", "Aliases": [ - "smtemp" + "prws" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 5 }, { - "Name": "ExecutionTimes", - "Type": "Bool", - "Desc": "Print execution time breakdown to stdout", + "Name": "Shrinkage", + "Type": "Float", + "Desc": "Shrinkage", "Aliases": [ - "et" + "shrk" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 1.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.025, + "Max": 4.0, + "IsLogScale": true + } }, { - "Name": "FeatureFraction", + "Name": "DropoutRate", "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each iteration", + "Desc": "Dropout rate for tree regularization", "Aliases": [ - "ff" + "tdrop" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0 + "Default": 0.0, + "SweepRange": { 
+ "RangeType": "Discrete", + "Values": [ + 0.0, + 1E-09, + 0.05, + 0.1, + 0.2 + ] + } }, { - "Name": "BaggingSize", + "Name": "GetDerivativesSampleRate", "Type": "Int", - "Desc": "Number of trees in each bag (0 for disabling bagging)", + "Desc": "Sample each query 1 in k times in the GetDerivatives function", "Aliases": [ - "bag" + "sr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1 }, { - "Name": "BaggingTrainFraction", - "Type": "Float", - "Desc": "Percentage of training examples used in each bag", + "Name": "WriteLastEnsemble", + "Type": "Bool", + "Desc": "Write the last ensemble instead of the one determined by early stopping", "Aliases": [ - "bagfrac" + "hl" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.7 + "Default": false }, { - "Name": "SplitFraction", + "Name": "MaxTreeOutput", "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each split", + "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ - "sf" + "mo" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0 + "Default": 100.0 }, { - "Name": "Smoothing", - "Type": "Float", - "Desc": "Smoothing paramter for tree regularization", - "Aliases": [ - "s" + "Name": "RandomStart", + "Type": "Bool", + "Desc": "Training starts from random ordering (determined by /r1)", + "Aliases": [ + "rs" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": false }, { - "Name": "AllowEmptyTrees", + "Name": "FilterZeroLambdas", "Type": "Bool", - "Desc": "When a root split is impossible, allow training to proceed", + "Desc": "Filter zero lambdas during training", "Aliases": [ - "allowempty", - "dummies" + "fzl" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": false }, { - "Name": "FeatureCompressionLevel", - "Type": "Int", - "Desc": "The level of feature compression to use", + "Name": 
"BaselineScoresFormula", + "Type": "String", + "Desc": "Freeform defining the scores that should be used as the baseline ranker", "Aliases": [ - "fcomp" + "basescores" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": null }, { - "Name": "CompressEnsemble", - "Type": "Bool", - "Desc": "Compress the tree Ensemble", + "Name": "BaselineAlphaRisk", + "Type": "String", + "Desc": "Baseline alpha for tradeoffs of risk (0 is normal training)", "Aliases": [ - "cmp" + "basealpha" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": null }, { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", + "Name": "PositionDiscountFreeform", + "Type": "String", + "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ - "cmpmax" + "pdff" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": -1 + "Default": null }, { - "Name": "PrintTestGraph", - "Type": "Bool", - "Desc": "Print metrics graph for the first test set", + "Name": "ParallelTrainer", + "Type": { + "Kind": "Component", + "ComponentKind": "ParallelTraining" + }, + "Desc": "Allows to choose Parallel FastTree Learning Algorithm", "Aliases": [ - "graph" + "parag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": { + "Name": "Single" + } }, { - "Name": "PrintTrainValidGraph", - "Type": "Bool", - "Desc": "Print Train and Validation metrics in graph", + "Name": "NumThreads", + "Type": "Int", + "Desc": "The number of threads to use", "Aliases": [ - "graphtv" + "t" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": false + "IsNullable": true, + "Default": null }, { - "Name": "TestFrequency", + "Name": "RngSeed", "Type": "Int", - "Desc": "Calculate metric values for train/valid/test 
every k rounds", + "Desc": "The seed of the random number generator", "Aliases": [ - "tf" + "r1" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 2147483647 - } - ], - "Outputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The trained model" - } - ], - "InputKind": [ - "ITrainerInputWithGroupId", - "ITrainerInputWithWeight", - "ITrainerInputWithLabel", - "ITrainerInput" - ], - "OutputKind": [ - "IRegressionOutput", - "ITrainerOutput" - ] - }, - { - "Name": "Trainers.GeneralizedAdditiveModelBinaryClassifier", - "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.", - "FriendlyName": "Generalized Additive Model for Binary Classification", - "ShortName": "gam", - "Inputs": [ + "Default": 123 + }, { - "Name": "NumIterations", + "Name": "FeatureSelectSeed", "Type": "Int", - "Desc": "Total number of iterations over all features", + "Desc": "The seed of the active feature selection", "Aliases": [ - "iter" + "r3" ], "Required": false, - "SortOrder": 1.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 9500, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 200, - 1500, - 9500 - ] - } + "Default": 123 }, { - "Name": "TrainingData", - "Type": "DataView", - "Desc": "The data to be used for training", + "Name": "EntropyCoefficient", + "Type": "Float", + "Desc": "The entropy (regularization) coefficient between 0 and 1", "Aliases": [ - "data" + "e" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 }, { - "Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", + "Name": "HistogramPoolSize", + "Type": "Int", + "Desc": "The number of histograms in the pool (between 2 and numLeaves)", "Aliases": [ - "feat" + "ps" ], "Required": false, - "SortOrder": 2.0, + 
"SortOrder": 150.0, "IsNullable": false, - "Default": "Features" + "Default": -1 }, { - "Name": "MinDocuments", - "Type": "Int", - "Desc": "Minimum number of training instances required to form a partition", + "Name": "DiskTranspose", + "Type": "Bool", + "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", "Aliases": [ - "mi" + "dt" ], "Required": false, - "SortOrder": 3.0, - "IsNullable": false, - "Default": 10, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 1, - 10, - 50 - ] - } + "SortOrder": 150.0, + "IsNullable": true, + "Default": null }, { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Column to use for labels", + "Name": "FeatureFlocks", + "Type": "Bool", + "Desc": "Whether to collectivize features during dataset preparation to speed up training", "Aliases": [ - "lab" + "flocks" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Label" + "Default": true }, { - "Name": "LearningRates", - "Type": "Float", - "Desc": "The learning rate", + "Name": "CategoricalSplit", + "Type": "Bool", + "Desc": "Whether to do split based on multiple categorical feature values.", "Aliases": [ - "lr" + "cat" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 0.002, - "SweepRange": { - "RangeType": "Float", - "Min": 0.001, - "Max": 0.1, - "IsLogScale": true - } + "Default": false }, { - "Name": "WeightColumn", - "Type": "String", - "Desc": "Column to use for example weight", + "Name": "MaxCategoricalGroupsPerNode", + "Type": "Int", + "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ - "weight" + "mcg" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Weight" + "Default": 64 }, { - "Name": "NormalizeFeatures", - "Type": { - "Kind": "Enum", - "Values": [ - "No", - "Warn", - "Auto", - "Yes" - ] - }, - "Desc": "Normalize option for the feature column", + "Name": "MaxCategoricalSplitPoints", + "Type": "Int", + "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ - "norm" + "maxcat" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": 64 }, { - "Name": "Caching", - "Type": { - "Kind": "Enum", - "Values": [ - "Auto", - "Memory", - "Disk", - "None" - ] - }, - "Desc": "Whether learner should cache input training data", + "Name": "MinDocsPercentageForCategoricalSplit", + "Type": "Float", + "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", "Aliases": [ - "cache" + "mdop" ], "Required": false, - "SortOrder": 6.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": 0.001 }, { - "Name": "UnbalancedSets", - "Type": "Bool", - "Desc": "Should we use derivatives optimized for unbalanced sets", + "Name": "MinDocsForCategoricalSplit", + "Type": "Int", + "Desc": "Minimum categorical doc count in a bin to consider for a split.", "Aliases": [ - "us" + "mdo" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 100 }, { - "Name": "Calibrator", + "Name": "Bias", + "Type": "Float", + "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", + "Aliases": [ + "bias" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 + }, + { + "Name": "Bundling", "Type": { - "Kind": "Component", - "ComponentKind": "CalibratorTrainer" + "Kind": "Enum", + "Values": [ + 
"None", + "AggregateLowPopulation", + "Adjacent" + ] }, - "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", + "Desc": "Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", + "Aliases": [ + "bundle" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": { - "Name": "PlattCalibrator" - } + "Default": "None" }, { - "Name": "MaxCalibrationExamples", + "Name": "MaxBins", "Type": "Int", - "Desc": "The maximum number of examples to use when training the calibrator", + "Desc": "Maximum number of distinct values (bins) per feature", + "Aliases": [ + "mb" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1000000 + "Default": 255 }, { - "Name": "EntropyCoefficient", + "Name": "SparsifyThreshold", "Type": "Float", - "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Desc": "Sparsity level needed to use sparse feature representation", "Aliases": [ - "e" + "sp" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.7 + }, + { + "Name": "FeatureFirstUsePenalty", + "Type": "Float", + "Desc": "The feature first use penalty coefficient", + "Aliases": [ + "ffup" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 + }, + { + "Name": "FeatureReusePenalty", + "Type": "Float", + "Desc": "The feature re-use penalty (regularization) coefficient", + "Aliases": [ + "frup" ], "Required": false, "SortOrder": 150.0, @@ -8884,7 +9260,7 @@ }, { "Name": "GainConfidenceLevel", - "Type": "Int", + "Type": "Float", "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", "Aliases": [ "gainconf" @@ -8892,62 +9268,111 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 0.0 }, { - "Name": "NumThreads", - "Type": "Int", - "Desc": "The number of threads 
to use", + "Name": "SoftmaxTemperature", + "Type": "Float", + "Desc": "The temperature of the randomized softmax distribution for choosing the feature", "Aliases": [ - "t" + "smtemp" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 0.0 }, { - "Name": "DiskTranspose", + "Name": "ExecutionTimes", "Type": "Bool", - "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", + "Desc": "Print execution time breakdown to stdout", "Aliases": [ - "dt" + "et" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": false }, { - "Name": "MaxBins", + "Name": "FeatureFraction", + "Type": "Float", + "Desc": "The fraction of features (chosen randomly) to use on each iteration", + "Aliases": [ + "ff" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + }, + { + "Name": "BaggingSize", "Type": "Int", - "Desc": "Maximum number of distinct values (bins) per feature", + "Desc": "Number of trees in each bag (0 for disabling bagging)", "Aliases": [ - "mb" + "bag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 255 + "Default": 0 }, { - "Name": "MaxOutput", + "Name": "BaggingTrainFraction", "Type": "Float", - "Desc": "Upper bound on absolute value of single output", + "Desc": "Percentage of training examples used in each bag", "Aliases": [ - "mo" + "bagfrac" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "Infinity" + "Default": 0.7 }, { - "Name": "GetDerivativesSampleRate", + "Name": "SplitFraction", + "Type": "Float", + "Desc": "The fraction of features (chosen randomly) to use on each split", + "Aliases": [ + "sf" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + }, + { + "Name": "Smoothing", + "Type": "Float", + "Desc": "Smoothing paramter for tree 
regularization", + "Aliases": [ + "s" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 + }, + { + "Name": "AllowEmptyTrees", + "Type": "Bool", + "Desc": "When a root split is impossible, allow training to proceed", + "Aliases": [ + "allowempty", + "dummies" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "FeatureCompressionLevel", "Type": "Int", - "Desc": "Sample each query 1 in k times in the GetDerivatives function", + "Desc": "The level of feature compression to use", "Aliases": [ - "sr" + "fcomp" ], "Required": false, "SortOrder": 150.0, @@ -8955,28 +9380,64 @@ "Default": 1 }, { - "Name": "RngSeed", + "Name": "CompressEnsemble", + "Type": "Bool", + "Desc": "Compress the tree Ensemble", + "Aliases": [ + "cmp" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "MaxTreesAfterCompression", "Type": "Int", - "Desc": "The seed of the random number generator", + "Desc": "Maximum Number of trees after compression", "Aliases": [ - "r1" + "cmpmax" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 123 + "Default": -1 }, { - "Name": "FeatureFlocks", + "Name": "PrintTestGraph", "Type": "Bool", - "Desc": "Whether to collectivize features during dataset preparation to speed up training", + "Desc": "Print metrics graph for the first test set", "Aliases": [ - "flocks" + "graph" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": false + }, + { + "Name": "PrintTrainValidGraph", + "Type": "Bool", + "Desc": "Print Train and Validation metrics in graph", + "Aliases": [ + "graphtv" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "TestFrequency", + "Type": "Int", + "Desc": "Calculate metric values for train/valid/test every k rounds", + "Aliases": [ + "tf" + ], + "Required": false, + "SortOrder": 150.0, 
+ "IsNullable": false, + "Default": 2147483647 } ], "Outputs": [ @@ -8987,20 +9448,21 @@ } ], "InputKind": [ + "ITrainerInputWithGroupId", "ITrainerInputWithWeight", "ITrainerInputWithLabel", "ITrainerInput" ], "OutputKind": [ - "IBinaryClassificationOutput", + "IRegressionOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.GeneralizedAdditiveModelRegressor", + "Name": "Trainers.GeneralizedAdditiveModelBinaryClassifier", "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.", - "FriendlyName": "Generalized Additive Model for Regression", - "ShortName": "gamr", + "FriendlyName": "Generalized Additive Model for Binary Classification", + "ShortName": "gam", "Inputs": [ { "Name": "NumIterations", @@ -9148,24 +9610,59 @@ "Default": "Auto" }, { - "Name": "EntropyCoefficient", - "Type": "Float", - "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Name": "UnbalancedSets", + "Type": "Bool", + "Desc": "Should we use derivatives optimized for unbalanced sets", "Aliases": [ - "e" + "us" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": false }, { - "Name": "GainConfidenceLevel", - "Type": "Int", - "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", - "Aliases": [ - "gainconf" - ], + "Name": "Calibrator", + "Type": { + "Kind": "Component", + "ComponentKind": "CalibratorTrainer" + }, + "Desc": "The calibrator kind to apply to the predictor. 
Specify null for no calibration", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": { + "Name": "PlattCalibrator" + } + }, + { + "Name": "MaxCalibrationExamples", + "Type": "Int", + "Desc": "The maximum number of examples to use when training the calibrator", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000000 + }, + { + "Name": "EntropyCoefficient", + "Type": "Float", + "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Aliases": [ + "e" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 + }, + { + "Name": "GainConfidenceLevel", + "Type": "Int", + "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", + "Aliases": [ + "gainconf" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -9269,16 +9766,36 @@ "ITrainerInput" ], "OutputKind": [ - "IRegressionOutput", + "IBinaryClassificationOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.KMeansPlusPlusClusterer", - "Desc": "K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified number of clusters in order to minimize the within-cluster sum of squares. K-means++ improves upon K-means by using a better method for choosing the initial cluster centers.", - "FriendlyName": "KMeans++ Clustering", - "ShortName": "KM", + "Name": "Trainers.GeneralizedAdditiveModelRegressor", + "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. 
It mantains no interactions between features.", + "FriendlyName": "Generalized Additive Model for Regression", + "ShortName": "gamr", "Inputs": [ + { + "Name": "NumIterations", + "Type": "Int", + "Desc": "Total number of iterations over all features", + "Aliases": [ + "iter" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 9500, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 200, + 1500, + 9500 + ] + } + }, { "Name": "TrainingData", "Type": "DataView", @@ -9302,6 +9819,56 @@ "IsNullable": false, "Default": "Features" }, + { + "Name": "MinDocuments", + "Type": "Int", + "Desc": "Minimum number of training instances required to form a partition", + "Aliases": [ + "mi" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": 10, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 10, + 50 + ] + } + }, + { + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", + "Aliases": [ + "lab" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Label" + }, + { + "Name": "LearningRates", + "Type": "Float", + "Desc": "The learning rate", + "Aliases": [ + "lr" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": 0.002, + "SweepRange": { + "RangeType": "Float", + "Min": 0.001, + "Max": 0.1, + "IsLogScale": true + } + }, { "Name": "WeightColumn", "Type": "String", @@ -9355,91 +9922,112 @@ "Default": "Auto" }, { - "Name": "K", + "Name": "EntropyCoefficient", + "Type": "Float", + "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Aliases": [ + "e" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 + }, + { + "Name": "GainConfidenceLevel", "Type": "Int", - "Desc": "The number of clusters", + "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", + "Aliases": [ + "gainconf" + ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 
150.0, "IsNullable": false, - "Default": 5, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 5, - 10, - 20, - 40 - ] - } + "Default": 0 }, { "Name": "NumThreads", "Type": "Int", - "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.", + "Desc": "The number of threads to use", "Aliases": [ - "nt", - "t", - "threads" + "t" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": true, "Default": null }, { - "Name": "InitAlgorithm", - "Type": { - "Kind": "Enum", - "Values": [ - "KMeansPlusPlus", - "Random", - "KMeansParallel" - ] - }, - "Desc": "Cluster initialization algorithm", + "Name": "DiskTranspose", + "Type": "Bool", + "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", "Aliases": [ - "init" + "dt" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "MaxBins", + "Type": "Int", + "Desc": "Maximum number of distinct values (bins) per feature", + "Aliases": [ + "mb" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "KMeansParallel" + "Default": 255 }, { - "Name": "OptTol", + "Name": "MaxOutput", "Type": "Float", - "Desc": "Tolerance parameter for trainer convergence. 
Lower = slower, more accurate", + "Desc": "Upper bound on absolute value of single output", "Aliases": [ - "ot" + "mo" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1E-07 + "Default": "Infinity" }, { - "Name": "MaxIterations", + "Name": "GetDerivativesSampleRate", "Type": "Int", - "Desc": "Maximum number of iterations.", + "Desc": "Sample each query 1 in k times in the GetDerivatives function", "Aliases": [ - "maxiter" + "sr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1000 + "Default": 1 }, { - "Name": "AccelMemBudgetMb", + "Name": "RngSeed", "Type": "Int", - "Desc": "Memory budget (in MBs) to use for KMeans acceleration", + "Desc": "The seed of the random number generator", "Aliases": [ - "accelMemBudgetMb" + "r1" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 4096 + "Default": 123 + }, + { + "Name": "FeatureFlocks", + "Type": "Bool", + "Desc": "Whether to collectivize features during dataset preparation to speed up training", + "Aliases": [ + "flocks" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true } ], "Outputs": [ @@ -9450,19 +10038,20 @@ } ], "InputKind": [ - "IUnsupervisedTrainerWithWeight", + "ITrainerInputWithWeight", + "ITrainerInputWithLabel", "ITrainerInput" ], "OutputKind": [ - "IClusteringOutput", + "IRegressionOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.LinearSvmBinaryClassifier", - "Desc": "Train a linear SVM.", - "FriendlyName": "SVM (Pegasos-Linear)", - "ShortName": "svm", + "Name": "Trainers.KMeansPlusPlusClusterer", + "Desc": "K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified number of clusters in order to minimize the within-cluster sum of squares. 
K-means++ improves upon K-means by using a better method for choosing the initial cluster centers.", + "FriendlyName": "KMeans++ Clustering", + "ShortName": "KM", "Inputs": [ { "Name": "TrainingData", @@ -9488,16 +10077,16 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "WeightColumn", "Type": "String", - "Desc": "Column to use for labels", + "Desc": "Column to use for example weight", "Aliases": [ - "lab" + "weight" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": "Label" + "Default": "Weight" }, { "Name": "NormalizeFeatures", @@ -9540,13 +10129,198 @@ "Default": "Auto" }, { - "Name": "Lambda", - "Type": "Float", - "Desc": "Regularizer constant", - "Aliases": [ - "lambda" - ], - "Required": false, + "Name": "K", + "Type": "Int", + "Desc": "The number of clusters", + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 5, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 5, + 10, + 20, + 40 + ] + } + }, + { + "Name": "NumThreads", + "Type": "Int", + "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.", + "Aliases": [ + "nt", + "t", + "threads" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "InitAlgorithm", + "Type": { + "Kind": "Enum", + "Values": [ + "KMeansPlusPlus", + "Random", + "KMeansParallel" + ] + }, + "Desc": "Cluster initialization algorithm", + "Aliases": [ + "init" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "KMeansParallel" + }, + { + "Name": "OptTol", + "Type": "Float", + "Desc": "Tolerance parameter for trainer convergence. 
Lower = slower, more accurate", + "Aliases": [ + "ot" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1E-07 + }, + { + "Name": "MaxIterations", + "Type": "Int", + "Desc": "Maximum number of iterations.", + "Aliases": [ + "maxiter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000 + }, + { + "Name": "AccelMemBudgetMb", + "Type": "Int", + "Desc": "Memory budget (in MBs) to use for KMeans acceleration", + "Aliases": [ + "accelMemBudgetMb" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 4096 + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "IUnsupervisedTrainerWithWeight", + "ITrainerInput" + ], + "OutputKind": [ + "IClusteringOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.LinearSvmBinaryClassifier", + "Desc": "Train a linear SVM.", + "FriendlyName": "SVM (Pegasos-Linear)", + "ShortName": "svm", + "Inputs": [ + { + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", + "Aliases": [ + "feat" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Features" + }, + { + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", + "Aliases": [ + "lab" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Label" + }, + { + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": "Auto" + }, + { + "Name": "Caching", + "Type": { + "Kind": "Enum", + 
"Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", + "Aliases": [ + "cache" + ], + "Required": false, + "SortOrder": 6.0, + "IsNullable": false, + "Default": "Auto" + }, + { + "Name": "Lambda", + "Type": "Float", + "Desc": "Regularizer constant", + "Aliases": [ + "lambda" + ], + "Required": false, "SortOrder": 50.0, "IsNullable": false, "Default": 0.001, @@ -11239,35 +12013,11 @@ ] }, { - "Name": "Trainers.StochasticDualCoordinateAscentBinaryClassifier", - "Desc": "Train an SDCA binary model.", - "FriendlyName": "Fast Linear (SA-SDCA)", - "ShortName": "SDCA", + "Name": "Trainers.RegressionEnsemble", + "Desc": "Train regression ensemble.", + "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", + "ShortName": null, "Inputs": [ - { - "Name": "L2Const", - "Type": "Float", - "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.", - "Aliases": [ - "l2" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", - 1E-07, - 1E-06, - 1E-05, - 0.0001, - 0.001, - 0.01 - ] - } - }, { "Name": "TrainingData", "Type": "DataView", @@ -11280,26 +12030,25 @@ "IsNullable": false }, { - "Name": "L1Threshold", - "Type": "Float", - "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. 
By default the l1 threshold is automatically inferred based on data set.", + "Name": "SamplingType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleSubsetSelector" + }, + "Desc": "Sampling Type", "Aliases": [ - "l1" + "st" ], "Required": false, "SortOrder": 2.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", - 0.0, - 0.25, - 0.5, - 0.75, - 1.0 - ] + "IsNullable": false, + "Default": { + "Name": "BootstrapSelector", + "Settings": { + "FeatureSelector": { + "Name": "AllFeatureSelector" + } + } } }, { @@ -11314,6 +12063,18 @@ "IsNullable": false, "Default": "Features" }, + { + "Name": "NumModels", + "Type": "Int", + "Desc": "Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise.", + "Aliases": [ + "nm" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": true, + "Default": null + }, { "Name": "LabelColumn", "Type": "String", @@ -11327,9 +12088,224 @@ "Default": "Label" }, { - "Name": "NormalizeFeatures", + "Name": "SubModelSelectorType", "Type": { - "Kind": "Enum", + "Kind": "Component", + "ComponentKind": "EnsembleRegressionSubModelSelector" + }, + "Desc": "Algorithm to prune the base learners for selective Ensemble", + "Aliases": [ + "pt" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": { + "Name": "AllSelector" + } + }, + { + "Name": "OutputCombiner", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleRegressionOutputCombiner" + }, + "Desc": "Output combiner", + "Aliases": [ + "oc" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": { + "Name": "Median" + } + }, + { + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": 
false, + "Default": "Auto" + }, + { + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", + "Aliases": [ + "cache" + ], + "Required": false, + "SortOrder": 6.0, + "IsNullable": false, + "Default": "Auto" + }, + { + "Name": "TrainParallel", + "Type": "Bool", + "Desc": "All the base learners will run asynchronously if the value is true", + "Aliases": [ + "tp" + ], + "Required": false, + "SortOrder": 106.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "BatchSize", + "Type": "Int", + "Desc": "Batch size", + "Aliases": [ + "bs" + ], + "Required": false, + "SortOrder": 107.0, + "IsNullable": false, + "Default": -1 + }, + { + "Name": "ShowMetrics", + "Type": "Bool", + "Desc": "True, if metrics for each model need to be evaluated and shown in comparison table. This is done by using validation set if available or the training set", + "Aliases": [ + "sm" + ], + "Required": false, + "SortOrder": 108.0, + "IsNullable": false, + "Default": false + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IRegressionOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.StochasticDualCoordinateAscentBinaryClassifier", + "Desc": "Train an SDCA binary model.", + "FriendlyName": "Fast Linear (SA-SDCA)", + "ShortName": "SDCA", + "Inputs": [ + { + "Name": "L2Const", + "Type": "Float", + "Desc": "L2 regularizer constant. 
By default the l2 constant is automatically inferred based on data set.", + "Aliases": [ + "l2" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 1E-07, + 1E-06, + 1E-05, + 0.0001, + 0.001, + 0.01 + ] + } + }, + { + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "L1Threshold", + "Type": "Float", + "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set.", + "Aliases": [ + "l1" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 0.0, + 0.25, + 0.5, + 0.75, + 1.0 + ] + } + }, + { + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", + "Aliases": [ + "feat" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Features" + }, + { + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", + "Aliases": [ + "lab" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Label" + }, + { + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", "Values": [ "No", "Warn", @@ -18102,281 +19078,967 @@ ], "Settings": [ { - "Name": "Slope", + "Name": "Slope", + "Type": "Float", + "Desc": "The slope parameter of f(x) = 1 / (1 + exp(-slope * x + offset)", + "Aliases": [ + "a" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + }, + { + "Name": "Offset", + "Type": "Float", + "Desc": "The offset parameter of f(x) = 1 / (1 + exp(-slope * x + offset)", + "Aliases": [ + "b" + ], + "Required": false, + "SortOrder": 150.0, + 
"IsNullable": false, + "Default": 0.0 + } + ] + }, + { + "Name": "NaiveCalibrator", + "Desc": null, + "FriendlyName": "Naive Calibrator", + "Aliases": [ + "Naive" + ], + "Settings": [] + }, + { + "Name": "PavCalibrator", + "Desc": null, + "FriendlyName": "PAV Calibrator", + "Aliases": [ + "Pav" + ], + "Settings": [] + }, + { + "Name": "PlattCalibrator", + "Desc": "Platt calibration.", + "FriendlyName": "Platt Calibrator", + "Aliases": [ + "Platt", + "Sigmoid" + ], + "Settings": [] + } + ] + }, + { + "Kind": "ClassificationLossFunction", + "Components": [ + { + "Name": "ExpLoss", + "Desc": "Exponential loss.", + "FriendlyName": "Exponential Loss", + "Settings": [ + { + "Name": "Beta", + "Type": "Float", + "Desc": "Beta (dilation)", + "Aliases": [ + "beta" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + } + ] + }, + { + "Name": "HingeLoss", + "Desc": "Hinge loss.", + "FriendlyName": "Hinge loss", + "Aliases": [ + "Hinge" + ], + "Settings": [ + { + "Name": "Margin", + "Type": "Float", + "Desc": "Margin value", + "Aliases": [ + "marg" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + } + ] + }, + { + "Name": "LogLoss", + "Desc": "Log loss.", + "FriendlyName": "Log loss", + "Aliases": [ + "Logistic", + "CrossEntropy" + ], + "Settings": [] + }, + { + "Name": "SmoothedHingeLoss", + "Desc": "Smoothed Hinge loss.", + "FriendlyName": "Smoothed Hinge Loss", + "Aliases": [ + "SmoothedHinge" + ], + "Settings": [ + { + "Name": "SmoothingConst", + "Type": "Float", + "Desc": "Smoothing constant", + "Aliases": [ + "smooth" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + } + ] + } + ] + }, + { + "Kind": "EarlyStoppingCriterion", + "Components": [ + { + "Name": "GL", + "Desc": "Stop in case of loss of generality.", + "FriendlyName": "Loss of Generality (GL)", + "Settings": [ + { + "Name": "Threshold", + "Type": "Float", + "Desc": "Threshold in range [0,1].", + 
"Aliases": [ + "th" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01, + "Range": { + "Max": 1.0, + "Min": 0.0 + } + } + ] + }, + { + "Name": "LP", + "Desc": "Stops in case of low progress.", + "FriendlyName": "Low Progress (LP)", + "Settings": [ + { + "Name": "Threshold", + "Type": "Float", + "Desc": "Threshold in range [0,1].", + "Aliases": [ + "th" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01, + "Range": { + "Max": 1.0, + "Min": 0.0 + } + }, + { + "Name": "WindowSize", + "Type": "Int", + "Desc": "The window size.", + "Aliases": [ + "w" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 5, + "Range": { + "Inf": 0 + } + } + ] + }, + { + "Name": "PQ", + "Desc": "Stops in case of generality to progress ration exceeds threshold.", + "FriendlyName": "Generality to Progress Ratio (PQ)", + "Settings": [ + { + "Name": "Threshold", + "Type": "Float", + "Desc": "Threshold in range [0,1].", + "Aliases": [ + "th" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01, + "Range": { + "Max": 1.0, + "Min": 0.0 + } + }, + { + "Name": "WindowSize", + "Type": "Int", + "Desc": "The window size.", + "Aliases": [ + "w" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 5, + "Range": { + "Inf": 0 + } + } + ] + }, + { + "Name": "TR", + "Desc": "Stop if validation score exceeds threshold value.", + "FriendlyName": "Tolerant (TR)", + "Settings": [ + { + "Name": "Threshold", + "Type": "Float", + "Desc": "Tolerance threshold. 
(Non negative value)", + "Aliases": [ + "th" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01, + "Range": { + "Min": 0.0 + } + } + ] + }, + { + "Name": "UP", + "Desc": "Stops in case of consecutive loss in generality.", + "FriendlyName": "Consecutive Loss in Generality (UP)", + "Settings": [ + { + "Name": "WindowSize", + "Type": "Int", + "Desc": "The window size.", + "Aliases": [ + "w" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 5, + "Range": { + "Inf": 0 + } + } + ] + } + ] + }, + { + "Kind": "EnsembleBinaryOutputCombiner", + "Components": [ + { + "Name": "Average", + "Desc": null, + "FriendlyName": "Average", + "Settings": [] + }, + { + "Name": "Median", + "Desc": null, + "FriendlyName": "Median", + "Settings": [] + }, + { + "Name": "Stacking", + "Desc": null, + "FriendlyName": "Stacking", + "Settings": [ + { + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", + "Aliases": [ + "vp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.3 + } + ] + }, + { + "Name": "Voting", + "Desc": null, + "FriendlyName": "Voting", + "Settings": [] + }, + { + "Name": "WeightedAverage", + "Desc": null, + "FriendlyName": "Stacking", + "Settings": [ + { + "Name": "WeightageName", + "Type": { + "Kind": "Enum", + "Values": [ + "Accuracy", + "Auc", + "PosPrecision", + "PosRecall", + "NegPrecision", + "NegRecall" + ] + }, + "Desc": "The metric type to be used to find the weights for each model", + "Aliases": [ + "wn" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": "Auc" + } + ] + } + ] + }, + { + "Kind": "EnsembleBinarySubModelSelector", + "Components": [ + { + "Name": "AllSelector", + "Desc": null, + "FriendlyName": "All Selector", + "Settings": [] + }, + { + "Name": "BestDiverseSelector", + "Desc": null, + "FriendlyName": "Best Diverse Selector", + "Settings": [ + { + "Name": "DiversityMetricType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleDiversityMeasure" + }, + "Desc": "The metric type to be used to find the diversity among base learners", + "Aliases": [ + "dm" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "LearnersSelectionProportion", + "Type": "Float", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", + "Aliases": [ + "lp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.5 + }, + { + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", + "Aliases": [ + "vp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.3 + } + ] + }, + { + "Name": "BestPerformanceSelector", + "Desc": null, + "FriendlyName": "Best Performance Selector", + "Settings": [ + { + "Name": "MetricName", + "Type": { + "Kind": "Enum", + "Values": [ + "Accuracy", + "PosPrecName", + "PosRecallName", + "NegPrecName", + "NegRecallName", + "Auc", + "LogLoss", + "LogLossReduction", + "F1", + "AuPrc" + ] + }, + "Desc": "The metric type to be used to find the best performance", + "Aliases": [ + "mn" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": "Auc" + }, + { + "Name": "LearnersSelectionProportion", + "Type": "Float", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", + "Aliases": [ + "lp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.5 + }, + { + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", + "Aliases": [ + "vp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.3 + } + ] + } + ] + }, + { + "Kind": "EnsembleDiversityMeasure", + "Components": [ + { + "Name": "DisagreementDiversityMeasure", + "Desc": null, + "FriendlyName": "Disagreement Diversity Measure", + "Settings": [] + }, + { + "Name": "MultiDisagreementDiversityMeasure", + "Desc": null, + "FriendlyName": "Disagreement Diversity Measure", + "Settings": [] + }, + { + "Name": "RegressionDisagreementDiversityMeasure", + "Desc": null, + "FriendlyName": "Disagreement Diversity Measure", + "Settings": [] + } + ] + }, + { + "Kind": "EnsembleFeatureSelector", + "Components": [ + { + "Name": "AllFeatureSelector", + "Desc": null, + "FriendlyName": "All Feature Selector", + "Settings": [] + }, + { + "Name": "RandomFeatureSelector", + "Desc": null, + "FriendlyName": "Random Feature Selector", + "Settings": [ + { + "Name": "FeaturesSelectionProportion", + "Type": "Float", + "Desc": "The proportion of features to be selected. 
The range is 0.0-1.0", + "Aliases": [ + "fp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.8 + } + ] + } + ] + }, + { + "Kind": "EnsembleMulticlassOutputCombiner", + "Components": [ + { + "Name": "MultiAverage", + "Desc": null, + "FriendlyName": "Average", + "Settings": [ + { + "Name": "Normalize", + "Type": "Bool", + "Desc": "Whether to normalize the output of base models before combining them", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": true + } + ] + }, + { + "Name": "MultiMedian", + "Desc": null, + "FriendlyName": "Median", + "Settings": [ + { + "Name": "Normalize", + "Type": "Bool", + "Desc": "Whether to normalize the output of base models before combining them", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": true + } + ] + }, + { + "Name": "MultiStacking", + "Desc": null, + "FriendlyName": "Stacking", + "Settings": [ + { + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", + "Aliases": [ + "vp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.3 + } + ] + }, + { + "Name": "MultiVoting", + "Desc": null, + "FriendlyName": "Voting", + "Settings": [ + { + "Name": "Normalize", + "Type": "Bool", + "Desc": "Whether to normalize the output of base models before combining them", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": true + } + ] + }, + { + "Name": "MultiWeightedAverage", + "Desc": null, + "FriendlyName": "Multi Weighted Average", + "Settings": [ + { + "Name": "WeightageName", + "Type": { + "Kind": "Enum", + "Values": [ + "AccuracyMicroAvg", + "AccuracyMacroAvg" + ] + }, + "Desc": "The metric type to be used to find the weights for each model", + "Aliases": [ + "wn" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": "AccuracyMicroAvg" + }, + { + "Name": "Normalize", + "Type": "Bool", + "Desc": "Whether to normalize the output of base models before combining them", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": true + } + ] + } + ] + }, + { + "Kind": "EnsembleMulticlassSubModelSelector", + "Components": [ + { + "Name": "AllSelectorMultiClass", + "Desc": null, + "FriendlyName": "All Selector", + "Settings": [] + }, + { + "Name": "BestDiverseSelectorMultiClass", + "Desc": null, + "FriendlyName": "Best Diverse Selector", + "Settings": [ + { + "Name": "DiversityMetricType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleDiversityMeasure" + }, + "Desc": "The metric type to be used to find the diversity among base learners", + "Aliases": [ + "dm" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "LearnersSelectionProportion", "Type": "Float", - "Desc": "The slope parameter of f(x) = 1 / (1 + exp(-slope * x + offset)", + "Desc": "The proportion of best 
base learners to be selected. The range is 0.0-1.0", "Aliases": [ - "a" + "lp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 1.0 + "Default": 0.5 }, { - "Name": "Offset", + "Name": "ValidationDatasetProportion", "Type": "Float", - "Desc": "The offset parameter of f(x) = 1 / (1 + exp(-slope * x + offset)", + "Desc": "The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set", "Aliases": [ - "b" + "vp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 0.0 + "Default": 0.3 } ] }, { - "Name": "NaiveCalibrator", - "Desc": null, - "FriendlyName": "Naive Calibrator", - "Aliases": [ - "Naive" - ], - "Settings": [] - }, - { - "Name": "PavCalibrator", + "Name": "BestPerformanceSelectorMultiClass", "Desc": null, - "FriendlyName": "PAV Calibrator", - "Aliases": [ - "Pav" - ], - "Settings": [] - }, - { - "Name": "PlattCalibrator", - "Desc": "Platt calibration.", - "FriendlyName": "Platt Calibrator", - "Aliases": [ - "Platt", - "Sigmoid" - ], - "Settings": [] - } - ] - }, - { - "Kind": "ClassificationLossFunction", - "Components": [ - { - "Name": "ExpLoss", - "Desc": "Exponential loss.", - "FriendlyName": "Exponential Loss", + "FriendlyName": "Best Performance Selector", "Settings": [ { - "Name": "Beta", + "Name": "MetricName", + "Type": { + "Kind": "Enum", + "Values": [ + "AccuracyMicro", + "AccuracyMacro", + "LogLoss", + "LogLossReduction" + ] + }, + "Desc": "The metric type to be used to find the best performance", + "Aliases": [ + "mn" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": "AccuracyMicro" + }, + { + "Name": "LearnersSelectionProportion", "Type": "Float", - "Desc": "Beta (dilation)", + "Desc": "The proportion of best base learners to be selected. 
The range is 0.0-1.0", "Aliases": [ - "beta" + "lp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 1.0 - } - ] - }, - { - "Name": "HingeLoss", - "Desc": "Hinge loss.", - "FriendlyName": "Hinge loss", - "Aliases": [ - "Hinge" - ], - "Settings": [ + "Default": 0.5 + }, { - "Name": "Margin", + "Name": "ValidationDatasetProportion", "Type": "Float", - "Desc": "Margin value", + "Desc": "The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set", "Aliases": [ - "marg" + "vp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 1.0 + "Default": 0.3 } ] + } + ] + }, + { + "Kind": "EnsembleRegressionOutputCombiner", + "Components": [ + { + "Name": "Average", + "Desc": null, + "FriendlyName": "Average", + "Settings": [] }, { - "Name": "LogLoss", - "Desc": "Log loss.", - "FriendlyName": "Log loss", - "Aliases": [ - "Logistic", - "CrossEntropy" - ], + "Name": "Median", + "Desc": null, + "FriendlyName": "Median", "Settings": [] }, { - "Name": "SmoothedHingeLoss", - "Desc": "Smoothed Hinge loss.", - "FriendlyName": "Smoothed Hinge Loss", - "Aliases": [ - "SmoothedHinge" - ], + "Name": "RegressionStacking", + "Desc": null, + "FriendlyName": "Stacking", "Settings": [ { - "Name": "SmoothingConst", + "Name": "ValidationDatasetProportion", "Type": "Float", - "Desc": "Smoothing constant", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", "Aliases": [ - "smooth" + "vp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 1.0 + "Default": 0.3 } ] } ] }, { - "Kind": "EarlyStoppingCriterion", + "Kind": "EnsembleRegressionSubModelSelector", "Components": [ { - "Name": "GL", - "Desc": "Stop in case of loss of generality.", - "FriendlyName": "Loss of Generality (GL)", + "Name": "AllSelector", + "Desc": null, + "FriendlyName": "All Selector", + "Settings": [] + }, + { + "Name": "BestDiverseSelectorRegression", + "Desc": null, + "FriendlyName": "Best Diverse Selector", "Settings": [ { - "Name": "Threshold", - "Type": "Float", - "Desc": "Threshold in range [0,1].", + "Name": "DiversityMetricType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleDiversityMeasure" + }, + "Desc": "The metric type to be used to find the diversity among base learners", "Aliases": [ - "th" + "dm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 0.01, - "Range": { - "Max": 1.0, - "Min": 0.0 - } - } - ] - }, - { - "Name": "LP", - "Desc": "Stops in case of low progress.", - "FriendlyName": "Low Progress (LP)", - "Settings": [ + "Default": null + }, { - "Name": "Threshold", + "Name": "LearnersSelectionProportion", "Type": "Float", - "Desc": "Threshold in range [0,1].", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", "Aliases": [ - "th" + "lp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 0.01, - "Range": { - "Max": 1.0, - "Min": 0.0 - } + "Default": 0.5 }, { - "Name": "WindowSize", - "Type": "Int", - "Desc": "The window size.", + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", "Aliases": [ - "w" + "vp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 5, - "Range": { - "Inf": 0 - } + "Default": 0.3 } ] }, { - "Name": "PQ", - "Desc": "Stops in case of generality to progress ration exceeds threshold.", - "FriendlyName": "Generality to Progress Ratio (PQ)", + "Name": "BestPerformanceRegressionSelector", + "Desc": null, + "FriendlyName": "Best Performance Selector", "Settings": [ { - "Name": "Threshold", + "Name": "MetricName", + "Type": { + "Kind": "Enum", + "Values": [ + "L1", + "L2", + "Rms", + "Loss", + "RSquared" + ] + }, + "Desc": "The metric type to be used to find the best performance", + "Aliases": [ + "mn" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": "L1" + }, + { + "Name": "LearnersSelectionProportion", "Type": "Float", - "Desc": "Threshold in range [0,1].", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", "Aliases": [ - "th" + "lp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 0.01, - "Range": { - "Max": 1.0, - "Min": 0.0 - } + "Default": 0.5 }, { - "Name": "WindowSize", - "Type": "Int", - "Desc": "The window size.", + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", "Aliases": [ - "w" + "vp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 5, - "Range": { - "Inf": 0 + "Default": 0.3 + } + ] + } + ] + }, + { + "Kind": "EnsembleSubsetSelector", + "Components": [ + { + "Name": "AllInstanceSelector", + "Desc": null, + "FriendlyName": "All Instance Selector", + "Settings": [ + { + "Name": "FeatureSelector", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleFeatureSelector" + }, + "Desc": "The Feature selector", + "Aliases": [ + "fs" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": { + "Name": "AllFeatureSelector" } } ] }, { - "Name": "TR", - "Desc": "Stop if validation score exceeds threshold value.", - "FriendlyName": "Tolerant (TR)", + "Name": "BootstrapSelector", + "Desc": null, + "FriendlyName": "Bootstrap Selector", "Settings": [ { - "Name": "Threshold", - "Type": "Float", - "Desc": "Tolerance threshold. (Non negative value)", + "Name": "FeatureSelector", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleFeatureSelector" + }, + "Desc": "The Feature selector", "Aliases": [ - "th" + "fs" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 1.0, "IsNullable": false, - "Default": 0.01, - "Range": { - "Min": 0.0 + "Default": { + "Name": "AllFeatureSelector" } } ] }, { - "Name": "UP", - "Desc": "Stops in case of consecutive loss in generality.", - "FriendlyName": "Consecutive Loss in Generality (UP)", + "Name": "RandomPartitionSelector", + "Desc": null, + "FriendlyName": "Random Partition Selector", "Settings": [ { - "Name": "WindowSize", - "Type": "Int", - "Desc": "The window size.", + "Name": "FeatureSelector", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleFeatureSelector" + }, + "Desc": "The Feature selector", "Aliases": [ - "w" + "fs" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 1.0, "IsNullable": false, - "Default": 5, - "Range": { - "Inf": 0 + 
"Default": { + "Name": "AllFeatureSelector" } } ] From a2630a2bc42243f84905827f63a90483efc1d2ab Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Tue, 19 Jun 2018 16:24:37 -0700 Subject: [PATCH 08/20] remove predictor tests --- .../Microsoft.ML.Predictor.Tests.csproj | 1 - .../TestPredictors.cs | 75 +------------------ test/Microsoft.ML.TestFramework/Datasets.cs | 7 ++ 3 files changed, 11 insertions(+), 72 deletions(-) diff --git a/test/Microsoft.ML.Predictor.Tests/Microsoft.ML.Predictor.Tests.csproj b/test/Microsoft.ML.Predictor.Tests/Microsoft.ML.Predictor.Tests.csproj index a8955ede87..2329491e08 100644 --- a/test/Microsoft.ML.Predictor.Tests/Microsoft.ML.Predictor.Tests.csproj +++ b/test/Microsoft.ML.Predictor.Tests/Microsoft.ML.Predictor.Tests.csproj @@ -12,7 +12,6 @@ - diff --git a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs index 3bd50440ff..e3aaa9d851 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs @@ -50,7 +50,7 @@ public IList GetDatasetsForMulticlassClassifierTest() /// public IList GetDatasetsForRegressorTest() { - return new[] { TestDatasets.winequality }; + return new[] { TestDatasets.housing }; } /// @@ -355,70 +355,6 @@ public void FastForestRegressionTest() Done(); } - [Fact] - [TestCategory("Regressor"), TestCategory("Ensemble - Regression")] - public void RegressorEnsembleTest() - { - IList regressionPredictors = new PredictorAndArgs[] { TestLearners.EnsembleRegression }; - IList regressionDatasets = GetDatasetsForRegressorTest(); - RunAllTests(regressionPredictors, regressionDatasets); - Done(); - } - - [Fact] - [TestCategory("Regressor"), TestCategory("Ensemble - Regression")] - public void RegressorEnsembleNumModelsTest() - { - IList regressionPredictors = new PredictorAndArgs[] { TestLearners.EnsembleRegressionNumModels }; - IList regressionDatasets = GetDatasetsForRegressorTest(); - 
RunAllTests(regressionPredictors, regressionDatasets); - Done(); - } - - [Fact] - [ TestCategory("Regressor"), TestCategory("Ensemble - Regression")] - public void RegressorEnsembleAverageCombinerTest() - { - IList regressionPredictors = new PredictorAndArgs[] { TestLearners.EnsembleRegressionAverageCombiner, }; - IList regressionDatasets = GetDatasetsForRegressorTest(); - RunAllTests(regressionPredictors, regressionDatasets); - Done(); - } - - [Fact] - [TestCategory("Regressor"), TestCategory("Ensemble - Regression")] - public void RegressorEnsembleDiverseSelectorTest() - { - IList regressionPredictors = new PredictorAndArgs[] { TestLearners.EnsembleRegressionDiverseSelector }; - IList regressionDatasets = GetDatasetsForRegressorTest(); - RunAllTests(regressionPredictors, regressionDatasets); - Done(); - } - - [Fact] - [TestCategory("Regressor"), TestCategory("Ensemble - Regression")] - public void RegressorEnsemblePerformanceSelectorTest() - { - IList regressionPredictors = new PredictorAndArgs[] { TestLearners.EnsembleRegressionPerformanceSelector }; - IList regressionDatasets = GetDatasetsForRegressorTest(); - RunAllTests(regressionPredictors, regressionDatasets); - Done(); - } - - [Fact] - [ TestCategory("Regressor"), TestCategory("Ensemble - Regression"), TestCategory("FastTree")] - public void RegressorEnsembleStackingCombinerTest() - { - IList regressionPredictors = new PredictorAndArgs[] { TestLearners.EnsembleRegressionStackingCombiner }; - IList regressionDatasets = GetDatasetsForRegressorTest(); - RunMTAThread(() => - { - // Default is a FastTree learner, so we have to push it into an MTA thread. - RunAllTests(regressionPredictors, regressionDatasets); - }); - Done(); - } - [Fact(Skip = "Need CoreTLC specific baseline update")] [TestCategory("Weighting Predictors")] [TestCategory("FastForest")] @@ -760,7 +696,7 @@ public void FastTreeUnderbuiltRegressionTest() // case where the number of actual leaves is less than the number of maximum leaves per tree. 
RunMTAThread(() => { - Run_TrainTest(TestLearners.FastTreeUnderbuiltRegressor, TestDatasets.winequality, null, "Underbuilt"); + Run_TrainTest(TestLearners.FastTreeUnderbuiltRegressor, TestDatasets.housing, null, "Underbuilt"); }); Done(); } @@ -828,7 +764,7 @@ public void RegressorOlsTest() [TestCategory("Regressor")] public void RegressorOlsTestOne() { - Run_TrainTest(TestLearners.Ols, TestDatasets.winequality); + Run_TrainTest(TestLearners.Ols, TestDatasets.housing); Done(); } @@ -1884,7 +1820,4 @@ public void TestFeatureHandlerModelReuse() } } #endif - - - -} +} \ No newline at end of file diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs index 161fd28881..22cf12a002 100644 --- a/test/Microsoft.ML.TestFramework/Datasets.cs +++ b/test/Microsoft.ML.TestFramework/Datasets.cs @@ -145,6 +145,13 @@ public static class TestDatasets testFilename = "vw.dat" }; + public static TestDataset housing = new TestDataset + { + name = "housing", + trainFilename = "housing.txt", + testFilename = "housing.txt" + }; + public static TestDataset winequality = new TestDataset { name = "wine", From 9fa08c147b31b10baee9910bee3ee53acccb9219 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Tue, 19 Jun 2018 16:38:05 -0700 Subject: [PATCH 09/20] fix tests input loader --- test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index 3134fe43aa..a16c962b5c 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -154,7 +154,7 @@ public void EntryPointFeatureCombiner() [Fact] public void EntryPointScoring() { - var dataView = GetBreastCancerDataView(); + var dataView = GetBreastCancerDataviewWithTextColumns(); dataView = Env.CreateTransform("Term{col=F1}", dataView); var 
trainData = FeatureCombiner.PrepareFeatures(Env, new FeatureCombiner.FeatureCombinerInput() { Data = dataView, Features = new[] { "F1", "F2", "Rest" } }); var lrModel = LogisticRegression.TrainBinary(Env, new LogisticRegression.Arguments { TrainingData = trainData.OutputData }).PredictorModel; @@ -174,7 +174,7 @@ public void EntryPointScoring() [Fact] public void EntryPointApplyModel() { - var dataView = GetBreastCancerDataView(); + var dataView = GetBreastCancerDataviewWithTextColumns(); dataView = Env.CreateTransform("Term{col=F1}", dataView); From 687ceed8443406faaccc640fb569f7c9190f5aff Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Wed, 20 Jun 2018 10:26:47 -0700 Subject: [PATCH 10/20] address mistype, get rid of WeightedValue --- .../EntryPoints/CreateEnsemble.cs | 16 ++-------------- .../EntryPoints/DiversityMeasure.cs | 4 ++-- .../BestDiverseSelectorMultiClass.cs | 2 +- .../Trainer/Binary/EnsembleTrainer.cs | 18 +++++------------- .../MulticlassDataPartitionEnsembleTrainer.cs | 12 ++++-------- .../Regression/RegressionEnsembleTrainer.cs | 13 ++++--------- src/Microsoft.ML.Ensemble/WeightedValue.cs | 14 -------------- 7 files changed, 18 insertions(+), 61 deletions(-) delete mode 100644 src/Microsoft.ML.Ensemble/WeightedValue.cs diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs b/src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs index 8078cac1f1..7978ffb761 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs +++ b/src/Microsoft.ML.Ensemble/EntryPoints/CreateEnsemble.cs @@ -131,16 +131,6 @@ private static void GetPipeline(IHostEnvironment env, InputBase input, out IData } } - private static IEnumerable> GetWeightedModels(IEnumerable models) - where T : class, IPredictor - { - return models.Select(predictor => new WeightedValue() - { - Value = predictor.Predictor as T, - Weight = 1 - }); - } - [TlcModule.EntryPoint(Name = "Models.BinaryEnsemble", Desc = "Combine binary classifiers into an ensemble", UserName = 
EnsembleTrainer.UserNameValue)] public static CommonOutputs.BinaryClassificationOutput CreateBinaryEnsemble(IHostEnvironment env, ClassifierInput input) { @@ -168,8 +158,7 @@ public static CommonOutputs.BinaryClassificationOutput CreateBinaryEnsemble(IHos } var trainer = new EnsembleTrainer(host, args); - var weightedModels = GetWeightedModels>(input.Models); - var ensemble = trainer.CombineModels(weightedModels); + var ensemble = trainer.CombineModels(input.Models.Select(pm => pm.Predictor as IPredictorProducing)); var predictorModel = new PredictorModel(host, transformedData, startingData, ensemble); @@ -201,8 +190,7 @@ public static CommonOutputs.RegressionOutput CreateRegressionEnsemble(IHostEnvir } var trainer = new RegressionEnsembleTrainer(host, args); - var weightedModels = GetWeightedModels>(input.Models); - var ensemble = trainer.CombineModels(weightedModels); + var ensemble = trainer.CombineModels(input.Models.Select(pm => pm.Predictor as IPredictorProducing)); var predictorModel = new PredictorModel(host, transformedData, startingData, ensemble); diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/DiversityMeasure.cs b/src/Microsoft.ML.Ensemble/EntryPoints/DiversityMeasure.cs index 6dcaedd3ca..889ba80411 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/DiversityMeasure.cs +++ b/src/Microsoft.ML.Ensemble/EntryPoints/DiversityMeasure.cs @@ -12,7 +12,7 @@ [assembly: EntryPointModule(typeof(DisagreementDiversityFactory))] [assembly: EntryPointModule(typeof(RegressionDisagreementDiversityFactory))] -[assembly: EntryPointModule(typeof(MultinDisagreementDiversityFactory))] +[assembly: EntryPointModule(typeof(MultiDisagreementDiversityFactory))] namespace Microsoft.ML.Ensemble.EntryPoints { @@ -29,7 +29,7 @@ public sealed class RegressionDisagreementDiversityFactory : ISupportDiversityMe } [TlcModule.Component(Name = MultiDisagreementDiversityMeasure.LoadName, FriendlyName = DisagreementDiversityMeasure.UserName)] - public sealed class 
MultinDisagreementDiversityFactory : ISupportDiversityMeasureFactory> + public sealed class MultiDisagreementDiversityFactory : ISupportDiversityMeasureFactory> { public IDiversityMeasure> CreateComponent(IHostEnvironment env) => new MultiDisagreementDiversityMeasure(); } diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs index c41fb89f7d..3c4bfc5471 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs @@ -23,7 +23,7 @@ public sealed class BestDiverseSelectorMultiClass : BaseDiverseSelector> DefaultDiversityMetricType => new MultinDisagreementDiversityFactory(); + protected override ISupportDiversityMeasureFactory> DefaultDiversityMetricType => new MultiDisagreementDiversityFactory(); [TlcModule.Component(Name = BestDiverseSelectorMultiClass.LoadName, FriendlyName = UserName)] public sealed class Arguments : DiverseSelectorArguments, ISupportMulticlassSubModelSelectorFactory diff --git a/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs index c178fd9f6a..a05c2811dc 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs @@ -29,7 +29,7 @@ namespace Microsoft.ML.Runtime.Ensemble /// public sealed class EnsembleTrainer : EnsembleTrainerBase, - IModelCombiner, TScalarPredictor> + IModelCombiner { public const string LoadNameValue = "WeightedEnsemble"; public const string UserNameValue = "Parallel Ensemble (bagging, stacking, etc)"; @@ -76,32 +76,24 @@ public override TScalarPredictor CreatePredictor() return new EnsemblePredictor(Host, PredictionKind, CreateModels(), Combiner); } - public TScalarPredictor CombineModels(IEnumerable> models) + public 
TScalarPredictor CombineModels(IEnumerable models) { - var weights = models.Select(m => m.Weight).ToArray(); - if (weights.All(w => w == 1)) - weights = null; var combiner = _outputCombiner.CreateComponent(Host); - var p = models.First().Value; + var p = models.First(); TScalarPredictor predictor = null; if (p is TDistPredictor) { predictor = new EnsembleDistributionPredictor(Host, p.PredictionKind, - models.Select(k => new FeatureSubsetModel((TDistPredictor)k.Value)).ToArray(), - combiner, - weights); + models.Select(k => new FeatureSubsetModel((TDistPredictor)k)).ToArray(), combiner); } else { predictor = new EnsemblePredictor(Host, p.PredictionKind, - models.Select(k => new FeatureSubsetModel(k.Value)).ToArray(), - combiner, - weights); + models.Select(k => new FeatureSubsetModel(k)).ToArray(), combiner); } return predictor; } } - } \ No newline at end of file diff --git a/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs index 7914753a83..97be1a5a8a 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs @@ -31,7 +31,7 @@ namespace Microsoft.ML.Runtime.Ensemble public sealed class MulticlassDataPartitionEnsembleTrainer : EnsembleTrainerBase, EnsembleMultiClassPredictor, IMulticlassSubModelSelector, IMultiClassOutputCombiner, SignatureMultiClassClassifierTrainer>, - IModelCombiner, TVectorPredictor> + IModelCombiner { public const string LoadNameValue = "WeightedEnsembleMulticlass"; public const string UserNameValue = "Multi-class Parallel Ensemble (bagging, stacking, etc)"; @@ -73,15 +73,11 @@ public override EnsembleMultiClassPredictor CreatePredictor() return new EnsembleMultiClassPredictor(Host, CreateModels(), combiner as IMultiClassOutputCombiner); } - public TVectorPredictor 
CombineModels(IEnumerable> models) + public TVectorPredictor CombineModels(IEnumerable models) { - var weights = models.Select(m => m.Weight).ToArray(); - if (weights.All(w => w == 1)) - weights = null; - var predictor = new EnsembleMultiClassPredictor(Host, - models.Select(k => new FeatureSubsetModel(k.Value)).ToArray(), - _outputCombiner.CreateComponent(Host), weights); + models.Select(k => new FeatureSubsetModel(k)).ToArray(), + _outputCombiner.CreateComponent(Host)); return predictor; } diff --git a/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs index 29643dc971..a43624a559 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs @@ -24,7 +24,7 @@ namespace Microsoft.ML.Runtime.Ensemble using TScalarPredictor = IPredictorProducing; public sealed class RegressionEnsembleTrainer : EnsembleTrainerBase, - IModelCombiner, TScalarPredictor> + IModelCombiner { public const string LoadNameValue = "EnsembleRegression"; public const string UserNameValue = "Regression Ensemble (bagging, stacking, etc)"; @@ -67,18 +67,13 @@ public override TScalarPredictor CreatePredictor() return new EnsemblePredictor(Host, PredictionKind, CreateModels(), Combiner); } - public TScalarPredictor CombineModels(IEnumerable> models) + public TScalarPredictor CombineModels(IEnumerable models) { - var weights = models.Select(m => m.Weight).ToArray(); - if (weights.All(w => w == 1)) - weights = null; var combiner = _outputCombiner.CreateComponent(Host); - var p = models.First().Value; + var p = models.First(); var predictor = new EnsemblePredictor(Host, p.PredictionKind, - models.Select(k => new FeatureSubsetModel(k.Value)).ToArray(), - combiner, - weights); + models.Select(k => new FeatureSubsetModel(k)).ToArray(), combiner); return predictor; } diff --git 
a/src/Microsoft.ML.Ensemble/WeightedValue.cs b/src/Microsoft.ML.Ensemble/WeightedValue.cs deleted file mode 100644 index 9c655ff4db..0000000000 --- a/src/Microsoft.ML.Ensemble/WeightedValue.cs +++ /dev/null @@ -1,14 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; - -namespace Microsoft.ML.Runtime.Ensemble -{ - public struct WeightedValue - { - public T Value; - public Single Weight; - } -} From 71ea37bd048cfcbff7aac0cc6abb259c7846831b Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Wed, 20 Jun 2018 12:45:14 -0700 Subject: [PATCH 11/20] small cleanup --- .../EntryPoints/PipelineEnsemble.cs | 5 ++++- .../EntryPoints/SubModelSelector.cs | 7 ------- .../FeatureSubsetModel.cs | 3 +-- .../OutputCombiners/Average.cs | 3 +-- .../OutputCombiners/MultiWeightedAverage.cs | 18 +++++++++--------- src/Microsoft.ML.Ensemble/PipelineEnsemble.cs | 12 ++++-------- .../BaseDisagreementDiversityMeasure.cs | 2 +- .../Selector/SubModelSelector/AllSelector.cs | 4 ++-- .../SubModelSelector/AllSelectorMultiClass.cs | 4 ++-- .../BaseBestPerformanceSelector.cs | 5 +---- .../BestDiverseSelectorBinary.cs | 7 ++----- .../BestDiverseSelectorMultiClass.cs | 7 ++----- .../BestDiverseSelectorRegression.cs | 7 ++----- .../BestPerformanceRegressionSelector.cs | 15 +++------------ .../BestPerformanceSelector.cs | 15 +++------------ .../BestPerformanceSelectorMultiClass.cs | 16 +++------------- .../SubsetSelector/RandomPartitionSelector.cs | 2 +- .../Trainer/EnsembleDistributionPredictor.cs | 15 +++++---------- .../Trainer/EnsemblePredictor.cs | 7 +++---- .../Trainer/EnsemblePredictorBase.cs | 5 +---- .../Trainer/EnsembleTrainerBase.cs | 16 +++++++--------- 21 files changed, 57 insertions(+), 118 deletions(-) diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/PipelineEnsemble.cs 
b/src/Microsoft.ML.Ensemble/EntryPoints/PipelineEnsemble.cs index 7b629bb498..bcfaaefb89 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/PipelineEnsemble.cs +++ b/src/Microsoft.ML.Ensemble/EntryPoints/PipelineEnsemble.cs @@ -30,7 +30,10 @@ public static SummaryOutput Summarize(IHostEnvironment env, SummarizePredictor.I host.CheckValue(input, nameof(input)); EntryPointUtils.CheckInputArgs(host, input); - input.PredictorModel.PrepareData(host, new EmptyDataView(host, input.PredictorModel.TransformModel.InputSchema), out RoleMappedData rmd, out IPredictor predictor); + input.PredictorModel.PrepareData(host, + new EmptyDataView(host, input.PredictorModel.TransformModel.InputSchema), + out RoleMappedData rmd, out IPredictor predictor +); var calibrated = predictor as CalibratedPredictorBase; while (calibrated != null) diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs b/src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs index a5c10e86f2..57001190ac 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs +++ b/src/Microsoft.ML.Ensemble/EntryPoints/SubModelSelector.cs @@ -24,8 +24,6 @@ namespace Microsoft.ML.Ensemble.EntryPoints [TlcModule.Component(Name = AllSelector.LoadName, FriendlyName = AllSelector.UserName)] public sealed class AllSelectorFactory : ISupportBinarySubModelSelectorFactory, ISupportRegressionSubModelSelectorFactory { - public ISubModelSelector CreateComponent(IHostEnvironment env) => new AllSelector(env); - IBinarySubModelSelector IComponentFactory.CreateComponent(IHostEnvironment env) => new AllSelector(env); IRegressionSubModelSelector IComponentFactory.CreateComponent(IHostEnvironment env) => new AllSelector(env); @@ -34,11 +32,6 @@ public sealed class AllSelectorFactory : ISupportBinarySubModelSelectorFactory, [TlcModule.Component(Name = AllSelectorMultiClass.LoadName, FriendlyName = AllSelectorMultiClass.UserName)] public sealed class AllSelectorMultiClassFactory : 
ISupportMulticlassSubModelSelectorFactory { - public ISubModelSelector> CreateComponent(IHostEnvironment env) - { - throw new NotImplementedException(); - } - IMulticlassSubModelSelector IComponentFactory.CreateComponent(IHostEnvironment env) => new AllSelectorMultiClass(env); } } diff --git a/src/Microsoft.ML.Ensemble/FeatureSubsetModel.cs b/src/Microsoft.ML.Ensemble/FeatureSubsetModel.cs index 6eb373b6e6..4518666d34 100644 --- a/src/Microsoft.ML.Ensemble/FeatureSubsetModel.cs +++ b/src/Microsoft.ML.Ensemble/FeatureSubsetModel.cs @@ -8,8 +8,7 @@ namespace Microsoft.ML.Runtime.Ensemble { - public sealed class FeatureSubsetModel - where TPredictor : IPredictor + public sealed class FeatureSubsetModel where TPredictor : IPredictor { public readonly TPredictor Predictor; public readonly BitArray SelectedFeatures; diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/Average.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/Average.cs index a4d732fe2d..45cd764d13 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/Average.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/Average.cs @@ -55,8 +55,7 @@ protected override void SaveCore(ModelSaveContext ctx) public override Combiner GetCombiner() { // Force the weights to null. - return - (ref Single dst, Single[] src, Single[] weights) => + return(ref Single dst, Single[] src, Single[] weights) => CombineCore(ref dst, src, null); } } diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiWeightedAverage.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiWeightedAverage.cs index 66bf23cfa2..9bda1d151a 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiWeightedAverage.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiWeightedAverage.cs @@ -19,15 +19,6 @@ namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners { - // These values are serialized, so should not be changed. 
- public enum MultiWeightageKind - { - [TGUI(Label = MultiClassClassifierEvaluator.AccuracyMicro)] - AccuracyMicroAvg = 0, - [TGUI(Label = MultiClassClassifierEvaluator.AccuracyMacro)] - AccuracyMacroAvg = 1 - } - /// /// Generic interface for combining outputs of multiple models /// @@ -101,4 +92,13 @@ public override Combiner> GetCombiner() return CombineCore; } } + + // These values are serialized, so should not be changed. + public enum MultiWeightageKind + { + [TGUI(Label = MultiClassClassifierEvaluator.AccuracyMicro)] + AccuracyMicroAvg = 0, + [TGUI(Label = MultiClassClassifierEvaluator.AccuracyMacro)] + AccuracyMacroAvg = 1 + } } diff --git a/src/Microsoft.ML.Ensemble/PipelineEnsemble.cs b/src/Microsoft.ML.Ensemble/PipelineEnsemble.cs index 54afc9ccd2..43956c608f 100644 --- a/src/Microsoft.ML.Ensemble/PipelineEnsemble.cs +++ b/src/Microsoft.ML.Ensemble/PipelineEnsemble.cs @@ -27,7 +27,8 @@ namespace Microsoft.ML.Runtime.Ensemble /// useful for the distributed training scenario, where the featurization includes trainable transforms (for example, /// categorical transform, or normalization). 
/// - public abstract class SchemaBindablePipelineEnsembleBase : ICanGetTrainingLabelNames, ICanSaveModel, ISchemaBindableMapper, ICanSaveSummary, ICanGetSummaryInKeyValuePairs + public abstract class SchemaBindablePipelineEnsembleBase : ICanGetTrainingLabelNames, ICanSaveModel, + ISchemaBindableMapper, ICanSaveSummary, ICanGetSummaryInKeyValuePairs { private abstract class BoundBase : ISchemaBoundRowMapper { @@ -38,13 +39,8 @@ private abstract class BoundBase : ISchemaBoundRowMapper protected readonly IRowToRowMapper[] BoundPipelines; protected readonly int[] ScoreCols; - public ISchemaBindableMapper Bindable - { - get { return Parent; } - } - + public ISchemaBindableMapper Bindable => Parent; public RoleMappedSchema InputSchema { get; } - public ISchema OutputSchema { get; } public BoundBase(SchemaBindablePipelineEnsembleBase parent, RoleMappedSchema schema) @@ -226,7 +222,7 @@ public override ISchemaBoundMapper Bind(IHostEnvironment env, RoleMappedSchema s // This is an implementation of pipeline ensembles that combines scores of type float (regression and anomaly detection). 
private sealed class ImplOne : SchemaBindablePipelineEnsemble { - protected override ColumnType ScoreType { get { return NumberType.R4; } } + protected override ColumnType ScoreType => NumberType.R4; public override PredictionKind PredictionKind { diff --git a/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/BaseDisagreementDiversityMeasure.cs b/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/BaseDisagreementDiversityMeasure.cs index b2f7bad432..6a34da757a 100644 --- a/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/BaseDisagreementDiversityMeasure.cs +++ b/src/Microsoft.ML.Ensemble/Selector/DiversityMeasure/BaseDisagreementDiversityMeasure.cs @@ -16,7 +16,7 @@ public List> CalculateDiversityMeasure(IList 1); Contracts.Assert(predictions.Count == models.Count); - List> diversityValues = new List>(); + var diversityValues = new List>(); for (int i = 0; i < (models.Count - 1); i++) { diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelector.cs index cc3586b184..4196ab3558 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelector.cs @@ -16,9 +16,9 @@ public class AllSelector : BaseSubModelSelector, IBinarySubModelSelector public const string UserName = "All Selector"; public const string LoadName = "AllSelector"; - public override Single ValidationDatasetProportion { get { return 0; } } + public override Single ValidationDatasetProportion => 0; - protected override PredictionKind PredictionKind { get { return PredictionKind.BinaryClassification; } } + protected override PredictionKind PredictionKind => PredictionKind.BinaryClassification; public AllSelector(IHostEnvironment env) : base(env, LoadName) diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelectorMultiClass.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelectorMultiClass.cs index 
39912427eb..6c82fc25f5 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelectorMultiClass.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/AllSelectorMultiClass.cs @@ -18,9 +18,9 @@ public class AllSelectorMultiClass : BaseSubModelSelector>, IMul public const string UserName = "All Selector"; public const string LoadName = "AllSelectorMultiClass"; - public override Single ValidationDatasetProportion { get { return 0; } } + public override Single ValidationDatasetProportion => 0; - protected override PredictionKind PredictionKind { get { return PredictionKind.MultiClassClassification; } } + protected override PredictionKind PredictionKind => PredictionKind.MultiClassClassification; public AllSelectorMultiClass(IHostEnvironment env) : base(env, LoadName) diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseBestPerformanceSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseBestPerformanceSelector.cs index 3126a82885..8701e2833c 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseBestPerformanceSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BaseBestPerformanceSelector.cs @@ -14,10 +14,7 @@ public abstract class BaseBestPerformanceSelector : SubModelDataSelecto { protected abstract string MetricName { get; } - protected virtual bool IsAscMetric - { - get { return true; } - } + protected virtual bool IsAscMetric => true; protected BaseBestPerformanceSelector(ArgumentsBase args, IHostEnvironment env, string name) : base(args, env, name) diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs index 1ab403dd07..918fe16b7e 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs @@ -23,7 +23,7 @@ public sealed class 
BestDiverseSelectorBinary : BaseDiverseSelector DefaultDiversityMetricType => new DisagreementDiversityFactory(); + protected override ISupportDiversityMeasureFactory DefaultDiversityMetricType => new DisagreementDiversityFactory(); [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] public sealed class Arguments : DiverseSelectorArguments, ISupportBinarySubModelSelectorFactory @@ -43,9 +43,6 @@ public override List> CalculateDiversityMeasure(ILi return diversityMetric.CalculateDiversityMeasure(models, predictions); } - protected override PredictionKind PredictionKind - { - get { return PredictionKind.BinaryClassification; } - } + protected override PredictionKind PredictionKind => PredictionKind.BinaryClassification; } } diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs index 3c4bfc5471..eb5e229cc4 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs @@ -23,7 +23,7 @@ public sealed class BestDiverseSelectorMultiClass : BaseDiverseSelector> DefaultDiversityMetricType => new MultiDisagreementDiversityFactory(); + protected override ISupportDiversityMeasureFactory> DefaultDiversityMetricType => new MultiDisagreementDiversityFactory(); [TlcModule.Component(Name = BestDiverseSelectorMultiClass.LoadName, FriendlyName = UserName)] public sealed class Arguments : DiverseSelectorArguments, ISupportMulticlassSubModelSelectorFactory @@ -36,10 +36,7 @@ public BestDiverseSelectorMultiClass(IHostEnvironment env, Arguments args) { } - protected override PredictionKind PredictionKind - { - get { return PredictionKind.MultiClassClassification; } - } + protected override PredictionKind PredictionKind => PredictionKind.MultiClassClassification; public override List>> CalculateDiversityMeasure(IList> 
models, ConcurrentDictionary, VBuffer[]> predictions) diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs index aaa970c471..5b950b4541 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs @@ -23,7 +23,7 @@ public sealed class BestDiverseSelectorRegression : BaseDiverseSelector DefaultDiversityMetricType => new RegressionDisagreementDiversityFactory(); + protected override ISupportDiversityMeasureFactory DefaultDiversityMetricType => new RegressionDisagreementDiversityFactory(); [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] public sealed class Arguments : DiverseSelectorArguments, ISupportRegressionSubModelSelectorFactory @@ -43,9 +43,6 @@ public override List> CalculateDiversityMeasure(ILi return diversityMetric.CalculateDiversityMeasure(models, predictions); } - protected override PredictionKind PredictionKind - { - get { return PredictionKind.Regression; } - } + protected override PredictionKind PredictionKind => PredictionKind.Regression; } } diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs index 75201d8fb6..46f13e9cd1 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceRegressionSelector.cs @@ -44,19 +44,10 @@ public BestPerformanceRegressionSelector(IHostEnvironment env, Arguments args) Host.Assert(!string.IsNullOrEmpty(_metricName)); } - protected override string MetricName - { - get { return _metricName; } - } + protected override string MetricName => _metricName; - protected override bool IsAscMetric - { - get { 
return false; } - } + protected override bool IsAscMetric => false; - protected override PredictionKind PredictionKind - { - get { return PredictionKind.Regression; } - } + protected override PredictionKind PredictionKind => PredictionKind.Regression; } } diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs index e8257cd3a8..76742ad0ec 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelector.cs @@ -44,19 +44,10 @@ public BestPerformanceSelector(IHostEnvironment env, Arguments args) Host.Assert(!string.IsNullOrEmpty(_metricName)); } - protected override string MetricName - { - get { return _metricName; } - } + protected override string MetricName => _metricName; - protected override bool IsAscMetric - { - get { return _metric != BinaryClassifierEvaluator.Metrics.LogLoss; } - } + protected override bool IsAscMetric => _metric != BinaryClassifierEvaluator.Metrics.LogLoss; - protected override PredictionKind PredictionKind - { - get { return PredictionKind.BinaryClassification; } - } + protected override PredictionKind PredictionKind => PredictionKind.BinaryClassification; } } diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs index fe98ff50b2..0a9b9ac497 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestPerformanceSelectorMultiClass.cs @@ -44,20 +44,10 @@ public BestPerformanceSelectorMultiClass(IHostEnvironment env, Arguments args) Host.Assert(!string.IsNullOrEmpty(_metricName)); } - protected override PredictionKind PredictionKind - { - get { return 
PredictionKind.MultiClassClassification; } - } + protected override PredictionKind PredictionKind => PredictionKind.MultiClassClassification; - protected override bool IsAscMetric - { - get { return _metric != MultiClassClassifierEvaluator.Metrics.LogLoss; } - } + protected override bool IsAscMetric => _metric != MultiClassClassifierEvaluator.Metrics.LogLoss; - protected override string MetricName - { - get { return _metricName; } - } + protected override string MetricName => _metricName; } - } diff --git a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs index db11d0914c..322a2133dc 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/RandomPartitionSelector.cs @@ -41,7 +41,7 @@ public override IEnumerable GetSubsets(Batch batch, IRandom rand) args.Seed = (uint)rand.Next(); IDataTransform view = new GenerateNumberTransform(Host, args, Data.Data); - // REVIEW: This won't be very efficient when _size is large. + // REVIEW: This won't be very efficient when Size is large. 
for (int i = 0; i < Size; i++) { var viewTrain = new RangeFilter(Host, new RangeFilter.Arguments() { Column = name, Min = (Double)i / Size, Max = (Double)(i + 1) / Size }, view); diff --git a/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs b/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs index 139cf9207d..547800c152 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/EnsembleDistributionPredictor.cs @@ -20,10 +20,9 @@ namespace Microsoft.ML.Runtime.Ensemble { using TDistPredictor = IDistPredictorProducing; - public sealed class EnsembleDistributionPredictor : - EnsemblePredictorBase, - TDistPredictor, - IValueMapperDist + + public sealed class EnsembleDistributionPredictor : EnsemblePredictorBase, + TDistPredictor, IValueMapperDist { public const string UserName = "Ensemble Distribution Executor"; public const string LoaderSignature = "EnsemDbExec"; @@ -42,14 +41,12 @@ private static VersionInfo GetVersionInfo() } private readonly Single[] _averagedWeights; - private readonly Median _probabilityCombiner; - private readonly IValueMapperDist[] _mappers; public ColumnType InputType { get; } - public ColumnType OutputType { get { return NumberType.Float; } } - public ColumnType DistType { get { return NumberType.Float; } } + public ColumnType OutputType => NumberType.Float; + public ColumnType DistType => NumberType.Float; public override PredictionKind PredictionKind { get; } @@ -121,8 +118,6 @@ protected override void SaveCore(ModelSaveContext ctx) ctx.Writer.Write((int)PredictionKind); } - - public ValueMapper GetMapper() { Host.Check(typeof(TIn) == typeof(VBuffer)); diff --git a/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictor.cs b/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictor.cs index 514257a643..08c8f0dd8d 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictor.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictor.cs @@ -19,9 +19,8 
@@ namespace Microsoft.ML.Runtime.Ensemble { using TScalarPredictor = IPredictorProducing; - public sealed class EnsemblePredictor : - EnsemblePredictorBase, - IValueMapper + + public sealed class EnsemblePredictor : EnsemblePredictorBase, IValueMapper { public const string UserName = "Ensemble Executor"; public const string LoaderSignature = "EnsembleFloatExec"; @@ -42,7 +41,7 @@ private static VersionInfo GetVersionInfo() private readonly IValueMapper[] _mappers; public ColumnType InputType { get; } - public ColumnType OutputType { get { return NumberType.Float; } } + public ColumnType OutputType => NumberType.Float; public override PredictionKind PredictionKind { get; } internal EnsemblePredictor(IHostEnvironment env, PredictionKind kind, diff --git a/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictorBase.cs b/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictorBase.cs index ff24d38996..9f2ebfb804 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictorBase.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/EnsemblePredictorBase.cs @@ -14,10 +14,7 @@ namespace Microsoft.ML.Runtime.Ensemble { public abstract class EnsemblePredictorBase : PredictorBase, - IPredictorProducing, - ICanSaveInTextFormat, - ICanSaveModel, - ICanSaveSummary + IPredictorProducing, ICanSaveInTextFormat, ICanSaveModel, ICanSaveSummary where TPredictor : class, IPredictorProducing { private const string SubPredictorFmt = "SubPredictor_{0:000}"; diff --git a/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs b/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs index b418a7ccb1..776b1f5f53 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/EnsembleTrainerBase.cs @@ -19,6 +19,7 @@ namespace Microsoft.ML.Runtime.Ensemble { using Stopwatch = System.Diagnostics.Stopwatch; + public abstract class EnsembleTrainerBase : TrainerBase where TPredictor : class, IPredictorProducing where TSelector : class, ISubModelSelector @@ -52,14 
+53,11 @@ public abstract class ArgumentsBase : LearnerInputBaseWithLabel [TGUI(Label = "Show Sub-Model Metrics")] public bool ShowMetrics; - - - [Argument(ArgumentType.Multiple, HelpText = "Base predictor type", ShortName = "bp,basePredictorTypes", SortOrder = 1, Visibility =ArgumentAttribute.VisibilityType.CmdLineOnly)] + [Argument(ArgumentType.Multiple, HelpText = "Base predictor type", ShortName = "bp,basePredictorTypes", SortOrder = 1, Visibility = ArgumentAttribute.VisibilityType.CmdLineOnly)] public SubComponent>, TSig>[] BasePredictors; - - public const int DefaultNumModels = 50; } + private const int DefaultNumModels = 50; /// Command-line arguments protected readonly ArgumentsBase Args; protected readonly int NumModels; @@ -86,7 +84,7 @@ internal EnsembleTrainerBase(ArgumentsBase args, IHostEnvironment env, string na ch.CheckUserArg(Utils.Size(Args.BasePredictors) > 0, nameof(Args.BasePredictors), "This should have at-least one value"); NumModels = Args.NumModels ?? - (Args.BasePredictors.Length == 1 ? ArgumentsBase.DefaultNumModels : Args.BasePredictors.Length); + (Args.BasePredictors.Length == 1 ? DefaultNumModels : Args.BasePredictors.Length); ch.CheckUserArg(NumModels > 0, nameof(Args.NumModels), "Must be positive, or null to indicate numModels is the number of base predictors"); @@ -112,13 +110,13 @@ internal EnsembleTrainerBase(ArgumentsBase args, IHostEnvironment env, string na } } - public override bool NeedNormalization { get { return _needNorm; } } + public override bool NeedNormalization => _needNorm; - public override bool NeedCalibration { get { return _needCalibration; } } + public override bool NeedCalibration => _needCalibration; // No matter the internal predictors, we are performing multiple passes over the data // so it is probably appropriate to always cache. 
- public override bool WantCaching { get { return true; } } + public override bool WantCaching => true; public override void Train(RoleMappedData data) { From c1028c670e9e544ef1868ab5bf24d4fe11b8f96e Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Thu, 21 Jun 2018 10:39:18 -0700 Subject: [PATCH 12/20] merge with master, remove useless using --- .../SubModelSelector/BestDiverseSelectorBinary.cs | 1 + .../BestDiverseSelectorMultiClass.cs | 1 + .../BestDiverseSelectorRegression.cs | 1 + .../SubModelSelector/SubModelDataSelector.cs | 15 +++++++++------ .../Trainer/Binary/EnsembleTrainer.cs | 2 -- .../MulticlassDataPartitionEnsembleTrainer.cs | 2 -- 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs index 918fe16b7e..b3deb891f0 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorBinary.cs @@ -18,6 +18,7 @@ namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector { using TScalarPredictor = IPredictorProducing; + public sealed class BestDiverseSelectorBinary : BaseDiverseSelector, IBinarySubModelSelector { public const string UserName = "Best Diverse Selector"; diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs index eb5e229cc4..853297970d 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorMultiClass.cs @@ -19,6 +19,7 @@ namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector { using TVectorPredictor = IPredictorProducing>; + public sealed class BestDiverseSelectorMultiClass : BaseDiverseSelector, 
IDiversityMeasure>>, IMulticlassSubModelSelector { public const string UserName = "Best Diverse Selector"; diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs index 5b950b4541..d4d6979fe1 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/BestDiverseSelectorRegression.cs @@ -18,6 +18,7 @@ namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector { using TScalarPredictor = IPredictorProducing; + public sealed class BestDiverseSelectorRegression : BaseDiverseSelector, IRegressionSubModelSelector { public const string UserName = "Best Diverse Selector"; diff --git a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/SubModelDataSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/SubModelDataSelector.cs index c7e228cc9e..5953b30d97 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/SubModelDataSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubModelSelector/SubModelDataSelector.cs @@ -1,21 +1,24 @@ -using Microsoft.ML.Runtime.CommandLine; -using Microsoft.ML.Runtime.Internal.Internallearn; -using System; -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System; +using Microsoft.ML.Runtime.CommandLine; +using Microsoft.ML.Runtime.Internal.Internallearn; + namespace Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector { public abstract class SubModelDataSelector : BaseSubModelSelector { public abstract class ArgumentsBase { - [Argument(ArgumentType.AtMostOnce, HelpText = "The proportion of best base learners to be selected. 
The range is 0.0-1.0", ShortName = "lp", SortOrder = 50)] + [Argument(ArgumentType.AtMostOnce, ShortName = "lp", SortOrder = 50, + HelpText = "The proportion of best base learners to be selected. The range is 0.0-1.0")] [TGUI(Label = "Learners Selection Proportion")] public Single LearnersSelectionProportion = 0.5f; - [Argument(ArgumentType.AtMostOnce, HelpText = "The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set", ShortName = "vp", SortOrder = 50)] + [Argument(ArgumentType.AtMostOnce, ShortName = "vp", SortOrder = 50, + HelpText = "The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set")] [TGUI(Label = "Validation Dataset Proportion")] public Single ValidationDatasetProportion = 0.3f; } diff --git a/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs index a05c2811dc..c051d36231 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs @@ -9,10 +9,8 @@ using Microsoft.ML.Runtime.CommandLine; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Ensemble; -using Microsoft.ML.Runtime.Learners; using Microsoft.ML.Runtime.Ensemble.OutputCombiners; using Microsoft.ML.Runtime.Ensemble.Selector; -using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; using Microsoft.ML.Ensemble.EntryPoints; using Microsoft.ML.Runtime.Internal.Internallearn; diff --git a/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs index 97be1a5a8a..0cdd147c55 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs @@ -12,9 +12,7 @@ using 
Microsoft.ML.Runtime.Ensemble; using Microsoft.ML.Runtime.Ensemble.OutputCombiners; using Microsoft.ML.Runtime.Ensemble.Selector; -using Microsoft.ML.Runtime.Ensemble.Selector.SubModelSelector; using Microsoft.ML.Runtime.Internal.Internallearn; -using Microsoft.ML.Runtime.Learners; [assembly: LoadableClass(MulticlassDataPartitionEnsembleTrainer.Summary, typeof(MulticlassDataPartitionEnsembleTrainer), typeof(MulticlassDataPartitionEnsembleTrainer.Arguments), From 3c867ba0ee9b12d5948ab171027e998720ea7cd4 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Fri, 22 Jun 2018 12:11:04 -0700 Subject: [PATCH 13/20] adress comments --- .../EntryPoints/Ensemble.cs | 2 +- .../OutputCombiners/MultiStacking.cs | 2 +- .../OutputCombiners/RegressionStacking.cs | 2 +- .../FeatureSelector/RandomFeatureSelector.cs | 2 +- .../Trainer/Binary/EnsembleTrainer.cs | 6 ++---- .../MulticlassDataPartitionEnsembleTrainer.cs | 6 ++---- .../Regression/RegressionEnsembleTrainer.cs | 6 ++---- src/Microsoft.ML/CSharpApi.cs | 18 +++++++++--------- .../Common/EntryPoints/core_ep-list.tsv | 2 +- .../Common/EntryPoints/core_manifest.json | 2 +- .../UnitTests/TestEntryPoints.cs | 2 +- 11 files changed, 22 insertions(+), 28 deletions(-) diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs b/src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs index d00a835192..c49e393a12 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs +++ b/src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs @@ -13,7 +13,7 @@ namespace Microsoft.ML.Ensemble.EntryPoints { public static class Ensemble { - [TlcModule.EntryPoint(Name = "Trainers.BinaryEnsemble", Desc = "Train binary ensemble.", UserName = EnsembleTrainer.UserNameValue)] + [TlcModule.EntryPoint(Name = "Trainers.BinaryClassifierEnsemble", Desc = "Train binary ensemble.", UserName = EnsembleTrainer.UserNameValue)] public static CommonOutputs.BinaryClassificationOutput CreateBinaryEnsemble(IHostEnvironment env, EnsembleTrainer.Arguments input) { 
Contracts.CheckValue(env, nameof(env)); diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs index 1207af2c5d..588dd89508 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/MultiStacking.cs @@ -42,7 +42,7 @@ public sealed class Arguments : ArgumentsBase, ISupportMulticlassOutputCombinerF public Arguments() { - // REVIEW: Kinda stupid. Perhaps we can have a better non-parametetric learner. + // REVIEW: Perhaps we can have a better non-parametetric learner. BasePredictorType = new SubComponent, SignatureMultiClassClassifierTrainer>( "OVA", "p=FastTreeBinaryClassification"); } diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs index 0bf435aaca..65673f4d5b 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs @@ -10,7 +10,7 @@ using Microsoft.ML.Runtime.Model; [assembly: LoadableClass(typeof(RegressionStacking), typeof(RegressionStacking.Arguments), typeof(SignatureCombiner), - Stacking.UserName, RegressionStacking.LoadName)] + Stacking.UserName, RegressionStacking.LoaderSignature)] [assembly: LoadableClass(typeof(RegressionStacking), null, typeof(SignatureLoadModel), Stacking.UserName, RegressionStacking.LoadName)] diff --git a/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/RandomFeatureSelector.cs b/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/RandomFeatureSelector.cs index 6e709388a7..c0c9b8968f 100644 --- a/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/RandomFeatureSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/FeatureSelector/RandomFeatureSelector.cs @@ -22,7 +22,7 @@ public class RandomFeatureSelector : IFeatureSelector public const string UserName = "Random Feature Selector"; public const string LoadName = 
"RandomFeatureSelector"; - [TlcModule.Component(Name = RandomFeatureSelector.LoadName, FriendlyName = RandomFeatureSelector.UserName)] + [TlcModule.Component(Name = LoadName, FriendlyName = UserName)] public sealed class Arguments: ISupportFeatureSelectorFactory { [Argument(ArgumentType.AtMostOnce, HelpText = "The proportion of features to be selected. The range is 0.0-1.0", ShortName = "fp", SortOrder = 50)] diff --git a/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs index c051d36231..80fe4cbdee 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Binary/EnsembleTrainer.cs @@ -38,17 +38,15 @@ public sealed class Arguments : ArgumentsBase [Argument(ArgumentType.Multiple, HelpText = "Algorithm to prune the base learners for selective Ensemble", ShortName = "pt", SortOrder = 4)] [TGUI(Label = "Sub-Model Selector(pruning) Type", Description = "Algorithm to prune the base learners for selective Ensemble")] - public ISupportBinarySubModelSelectorFactory SubModelSelectorType; + public ISupportBinarySubModelSelectorFactory SubModelSelectorType = new AllSelectorFactory(); [Argument(ArgumentType.Multiple, HelpText = "Output combiner", ShortName = "oc", SortOrder = 5)] [TGUI(Label = "Output combiner", Description = "Output combiner type")] - public ISupportBinaryOutputCombinerFactory OutputCombiner; + public ISupportBinaryOutputCombinerFactory OutputCombiner = new MedianFactory(); public Arguments() { BasePredictors = new[] { new SubComponent, SignatureBinaryClassifierTrainer>("LinearSVM") }; - OutputCombiner = new MedianFactory(); - SubModelSelectorType = new AllSelectorFactory(); } } diff --git a/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs index 0cdd147c55..0e6b4f6a53 100644 --- 
a/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Multiclass/MulticlassDataPartitionEnsembleTrainer.cs @@ -39,17 +39,15 @@ public sealed class Arguments : ArgumentsBase { [Argument(ArgumentType.Multiple, HelpText = "Algorithm to prune the base learners for selective Ensemble", ShortName = "pt", SortOrder = 4)] [TGUI(Label = "Sub-Model Selector(pruning) Type", Description = "Algorithm to prune the base learners for selective Ensemble")] - public ISupportMulticlassSubModelSelectorFactory SubModelSelectorType; + public ISupportMulticlassSubModelSelectorFactory SubModelSelectorType = new AllSelectorMultiClassFactory(); [Argument(ArgumentType.Multiple, HelpText = "Output combiner", ShortName = "oc", SortOrder = 5)] [TGUI(Label = "Output combiner", Description = "Output combiner type")] - public ISupportMulticlassOutputCombinerFactory OutputCombiner; + public ISupportMulticlassOutputCombinerFactory OutputCombiner = new MultiMedian.Arguments(); public Arguments() { BasePredictors = new[] { new SubComponent, SignatureMultiClassClassifierTrainer>("MultiClassLogisticRegression") }; - OutputCombiner = new MultiMedian.Arguments(); - SubModelSelectorType = new AllSelectorMultiClassFactory(); } } diff --git a/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs b/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs index a43624a559..322c1e02a1 100644 --- a/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs +++ b/src/Microsoft.ML.Ensemble/Trainer/Regression/RegressionEnsembleTrainer.cs @@ -33,17 +33,15 @@ public sealed class Arguments : ArgumentsBase { [Argument(ArgumentType.Multiple, HelpText = "Algorithm to prune the base learners for selective Ensemble", ShortName = "pt", SortOrder = 4)] [TGUI(Label = "Sub-Model Selector(pruning) Type", Description = "Algorithm to prune the base learners for selective Ensemble")] - public 
ISupportRegressionSubModelSelectorFactory SubModelSelectorType; + public ISupportRegressionSubModelSelectorFactory SubModelSelectorType = new AllSelectorFactory(); [Argument(ArgumentType.Multiple, HelpText = "Output combiner", ShortName = "oc", SortOrder = 5)] [TGUI(Label = "Output combiner", Description = "Output combiner type")] - public ISupportRegressionOutputCombinerFactory OutputCombiner; + public ISupportRegressionOutputCombinerFactory OutputCombiner = new MedianFactory(); public Arguments() { BasePredictors = new[] { new SubComponent, SignatureRegressorTrainer>("OnlineGradientDescent") }; - OutputCombiner = new MedianFactory(); - SubModelSelectorType = new AllSelectorFactory(); } } diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index c47758763b..edfb967d1f 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -490,16 +490,16 @@ public void Add(Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier input, _jsonNodes.Add(Serialize("Trainers.AveragedPerceptronBinaryClassifier", input, output)); } - public Microsoft.ML.Trainers.BinaryEnsemble.Output Add(Microsoft.ML.Trainers.BinaryEnsemble input) + public Microsoft.ML.Trainers.BinaryClassifierEnsemble.Output Add(Microsoft.ML.Trainers.BinaryClassifierEnsemble input) { - var output = new Microsoft.ML.Trainers.BinaryEnsemble.Output(); + var output = new Microsoft.ML.Trainers.BinaryClassifierEnsemble.Output(); Add(input, output); return output; } - public void Add(Microsoft.ML.Trainers.BinaryEnsemble input, Microsoft.ML.Trainers.BinaryEnsemble.Output output) + public void Add(Microsoft.ML.Trainers.BinaryClassifierEnsemble input, Microsoft.ML.Trainers.BinaryClassifierEnsemble.Output output) { - _jsonNodes.Add(Serialize("Trainers.BinaryEnsemble", input, output)); + _jsonNodes.Add(Serialize("Trainers.BinaryClassifierEnsemble", input, output)); } public Microsoft.ML.Trainers.ClassificationEnsemble.Output Add(Microsoft.ML.Trainers.ClassificationEnsemble input) @@ 
-4177,7 +4177,7 @@ namespace Trainers /// /// Train binary ensemble. /// - public sealed partial class BinaryEnsemble : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class BinaryClassifierEnsemble : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { @@ -4261,18 +4261,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(BinaryEnsemble)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(BinaryClassifierEnsemble)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new BinaryEnsemblePipelineStep(output); + return new BinaryClassifierEnsemblePipelineStep(output); } - private class BinaryEnsemblePipelineStep : ILearningPipelinePredictorStep + private class BinaryClassifierEnsemblePipelineStep : ILearningPipelinePredictorStep { - public BinaryEnsemblePipelineStep(Output output) + public BinaryClassifierEnsemblePipelineStep(Output output) { Model = output.PredictorModel; } diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index 7866586a79..9b86202085 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -37,7 +37,7 @@ Models.SweepResultExtractor Extracts the sweep result. 
Microsoft.ML.Runtime.Entr Models.TrainTestBinaryEvaluator Train test for binary classification Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro TrainTestBinary Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Output] Models.TrainTestEvaluator General train test for any supported evaluator Microsoft.ML.Runtime.EntryPoints.TrainTestMacro TrainTest Microsoft.ML.Runtime.EntryPoints.TrainTestMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.TrainTestMacro+Output] Trainers.AveragedPerceptronBinaryClassifier Train a Average perceptron. Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer TrainBinary Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput -Trainers.BinaryEnsemble Train binary ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateBinaryEnsemble Microsoft.ML.Runtime.Ensemble.EnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput +Trainers.BinaryClassifierEnsemble Train binary ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateBinaryEnsemble Microsoft.ML.Runtime.Ensemble.EnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput Trainers.ClassificationEnsemble Train multiclass ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateMultiClassEnsemble Microsoft.ML.Runtime.Ensemble.MulticlassDataPartitionEnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput Trainers.FastForestBinaryClassifier Uses a random forest learner to perform binary classification. 
Microsoft.ML.Runtime.FastTree.FastForest TrainBinary Microsoft.ML.Runtime.FastTree.FastForestClassification+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput Trainers.FastForestRegressor Trains a random forest to fit target values using least-squares. Microsoft.ML.Runtime.FastTree.FastForest TrainRegression Microsoft.ML.Runtime.FastTree.FastForestRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index eb9054b783..1874a35d3a 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -4065,7 +4065,7 @@ ] }, { - "Name": "Trainers.BinaryEnsemble", + "Name": "Trainers.BinaryClassifierEnsemble", "Desc": "Train binary ensemble.", "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", "ShortName": null, diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index a16c962b5c..4de91acf9d 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -424,7 +424,7 @@ public void EntryPointCreateEnsemble() var regEnsemble = EnsembleCreator.CreateRegressionEnsemble(Env, regEnsembleInput).PredictorModel; var regScored = ScoreModel.Score(Env, - new ScoreModel.Input { Data = splitOutput.TestData[nModels], PredictorModel = medEnsemble }).ScoredData; + new ScoreModel.Input { Data = splitOutput.TestData[nModels], PredictorModel = regEnsemble }).ScoredData; var zippedScores = ZipDataView.Create(Env, individualScores); From fd7c99e63be3afbbaa02d2b673e4a6d18faf86f8 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Fri, 22 Jun 2018 13:45:15 -0700 Subject: [PATCH 14/20] revert leaked changes from other PR. 
fix test binary ensemble test --- .../UnitTests/TestEntryPoints.cs | 2 +- .../TestPredictors.cs | 2 +- test/Microsoft.ML.TestFramework/Datasets.cs | 2 +- test/Microsoft.ML.TestFramework/Learners.cs | 37 ------------------- 4 files changed, 3 insertions(+), 40 deletions(-) diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index 4de91acf9d..2dd0ffe57a 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -1781,7 +1781,7 @@ public void EntryPointLinearSVM() [Fact] public void EntryPointBinaryEnsemble() { - TestEntryPointRoutine("iris.txt", "Trainers.BinaryEnsemble"); + TestEntryPointRoutine("iris.txt", "Trainers.BinaryClassifierEnsemble"); } [Fact] diff --git a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs index e3aaa9d851..9bc183cd10 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs @@ -1820,4 +1820,4 @@ public void TestFeatureHandlerModelReuse() } } #endif -} \ No newline at end of file +} diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs index 22cf12a002..76bea677b6 100644 --- a/test/Microsoft.ML.TestFramework/Datasets.cs +++ b/test/Microsoft.ML.TestFramework/Datasets.cs @@ -157,7 +157,7 @@ public static class TestDatasets name = "wine", trainFilename = "external/winequality-white.csv", testFilename = "external/winequality-white.csv", - loaderSettings = "col=Label:R4:11 col=Features:R4:0-10 sep=semicolon header+" + loaderSettings = "col=Label:R4:11 col=Features:R4:0-10 sep=; header+" }; public static TestDataset msm = new TestDataset diff --git a/test/Microsoft.ML.TestFramework/Learners.cs b/test/Microsoft.ML.TestFramework/Learners.cs index 450bf98ac4..9b64c0ac74 100644 --- a/test/Microsoft.ML.TestFramework/Learners.cs 
+++ b/test/Microsoft.ML.TestFramework/Learners.cs @@ -748,42 +748,5 @@ public static PredictorAndArgs DssmDefault(int qryFeaturesCount, int docFeatures MamlArgs = new[] { "xf=Copy{col=DupFeatures:Features} xf=MinMax{col=Features col=DupFeatures} norm=No", "col[Feature]=DupFeatures" }, BaselineProgress = true }; - - public static PredictorAndArgs EnsembleRegression = new PredictorAndArgs - { - Trainer = new SubComponent("EnsembleRegression"), - Tag = "Default", - }; - - public static PredictorAndArgs EnsembleRegressionNumModels = new PredictorAndArgs - { - Trainer = new SubComponent("EnsembleRegression", "bp=OGD bp=FTR"), - Tag = "NumModels", - }; - - public static PredictorAndArgs EnsembleRegressionDiverseSelector = new PredictorAndArgs - { - Trainer = new SubComponent("EnsembleRegression", "pt = BestDiverseSelectorRegression"), - Tag = "DiverseRegression", - }; - - public static PredictorAndArgs EnsembleRegressionPerformanceSelector = new PredictorAndArgs - { - Trainer = new SubComponent("EnsembleRegression", "pt = BestPerformanceRegressionSelector"), - Tag = "PerformanceSelector", - }; - - public static PredictorAndArgs EnsembleRegressionAverageCombiner = new PredictorAndArgs - { - Trainer = new SubComponent("EnsembleRegression", "oc=Average"), - Tag = "Average", - }; - - public static PredictorAndArgs EnsembleRegressionStackingCombiner = new PredictorAndArgs - { - Trainer = new SubComponent("EnsembleRegression", "oc=RegressionStacking"), - Tag = "RegressionStacking", - }; - } } \ No newline at end of file From cd28d3d88aeaca6247e33b7c973a1a350ee6ff04 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Tue, 26 Jun 2018 12:07:21 -0700 Subject: [PATCH 15/20] Address comments --- .../EntryPoints/Ensemble.cs | 6 +- .../OutputCombiners/RegressionStacking.cs | 6 +- src/Microsoft.ML/CSharpApi.cs | 282 ++++++------ .../Common/EntryPoints/core_ep-list.tsv | 6 +- .../Common/EntryPoints/core_manifest.json | 408 +++++++++--------- 5 files changed, 355 insertions(+), 353 
deletions(-) diff --git a/src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs b/src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs index c49e393a12..728cccb1f6 100644 --- a/src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs +++ b/src/Microsoft.ML.Ensemble/EntryPoints/Ensemble.cs @@ -13,7 +13,7 @@ namespace Microsoft.ML.Ensemble.EntryPoints { public static class Ensemble { - [TlcModule.EntryPoint(Name = "Trainers.BinaryClassifierEnsemble", Desc = "Train binary ensemble.", UserName = EnsembleTrainer.UserNameValue)] + [TlcModule.EntryPoint(Name = "Trainers.EnsembleBinaryClassifier", Desc = "Train binary ensemble.", UserName = EnsembleTrainer.UserNameValue)] public static CommonOutputs.BinaryClassificationOutput CreateBinaryEnsemble(IHostEnvironment env, EnsembleTrainer.Arguments input) { Contracts.CheckValue(env, nameof(env)); @@ -26,7 +26,7 @@ public static CommonOutputs.BinaryClassificationOutput CreateBinaryEnsemble(IHos () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.LabelColumn)); } - [TlcModule.EntryPoint(Name = "Trainers.ClassificationEnsemble", Desc = "Train multiclass ensemble.", UserName = EnsembleTrainer.UserNameValue)] + [TlcModule.EntryPoint(Name = "Trainers.EnsembleClassification", Desc = "Train multiclass ensemble.", UserName = EnsembleTrainer.UserNameValue)] public static CommonOutputs.MulticlassClassificationOutput CreateMultiClassEnsemble(IHostEnvironment env, MulticlassDataPartitionEnsembleTrainer.Arguments input) { Contracts.CheckValue(env, nameof(env)); @@ -39,7 +39,7 @@ public static CommonOutputs.MulticlassClassificationOutput CreateMultiClassEnsem () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.LabelColumn)); } - [TlcModule.EntryPoint(Name = "Trainers.RegressionEnsemble", Desc = "Train regression ensemble.", UserName = EnsembleTrainer.UserNameValue)] + [TlcModule.EntryPoint(Name = "Trainers.EnsembleRegression", Desc = "Train regression ensemble.", UserName = 
EnsembleTrainer.UserNameValue)] public static CommonOutputs.RegressionOutput CreateRegressionEnsemble(IHostEnvironment env, RegressionEnsembleTrainer.Arguments input) { Contracts.CheckValue(env, nameof(env)); diff --git a/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs b/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs index 65673f4d5b..aeb011a51b 100644 --- a/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs +++ b/src/Microsoft.ML.Ensemble/OutputCombiners/RegressionStacking.cs @@ -10,13 +10,15 @@ using Microsoft.ML.Runtime.Model; [assembly: LoadableClass(typeof(RegressionStacking), typeof(RegressionStacking.Arguments), typeof(SignatureCombiner), - Stacking.UserName, RegressionStacking.LoaderSignature)] -[assembly: LoadableClass(typeof(RegressionStacking), null, typeof(SignatureLoadModel), Stacking.UserName, RegressionStacking.LoadName)] +[assembly: LoadableClass(typeof(RegressionStacking), null, typeof(SignatureLoadModel), + Stacking.UserName, RegressionStacking.LoaderSignature)] + namespace Microsoft.ML.Runtime.Ensemble.OutputCombiners { using TScalarPredictor = IPredictorProducing; + public sealed class RegressionStacking : BaseScalarStacking, IRegressionOutputCombiner, ICanSaveModel { public const string LoadName = "RegressionStacking"; diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index edfb967d1f..8c3a648ad2 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -490,28 +490,40 @@ public void Add(Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier input, _jsonNodes.Add(Serialize("Trainers.AveragedPerceptronBinaryClassifier", input, output)); } - public Microsoft.ML.Trainers.BinaryClassifierEnsemble.Output Add(Microsoft.ML.Trainers.BinaryClassifierEnsemble input) + public Microsoft.ML.Trainers.EnsembleBinaryClassifier.Output Add(Microsoft.ML.Trainers.EnsembleBinaryClassifier input) { - var output = new 
Microsoft.ML.Trainers.BinaryClassifierEnsemble.Output(); + var output = new Microsoft.ML.Trainers.EnsembleBinaryClassifier.Output(); Add(input, output); return output; } - public void Add(Microsoft.ML.Trainers.BinaryClassifierEnsemble input, Microsoft.ML.Trainers.BinaryClassifierEnsemble.Output output) + public void Add(Microsoft.ML.Trainers.EnsembleBinaryClassifier input, Microsoft.ML.Trainers.EnsembleBinaryClassifier.Output output) { - _jsonNodes.Add(Serialize("Trainers.BinaryClassifierEnsemble", input, output)); + _jsonNodes.Add(Serialize("Trainers.EnsembleBinaryClassifier", input, output)); } - public Microsoft.ML.Trainers.ClassificationEnsemble.Output Add(Microsoft.ML.Trainers.ClassificationEnsemble input) + public Microsoft.ML.Trainers.EnsembleClassification.Output Add(Microsoft.ML.Trainers.EnsembleClassification input) { - var output = new Microsoft.ML.Trainers.ClassificationEnsemble.Output(); + var output = new Microsoft.ML.Trainers.EnsembleClassification.Output(); Add(input, output); return output; } - public void Add(Microsoft.ML.Trainers.ClassificationEnsemble input, Microsoft.ML.Trainers.ClassificationEnsemble.Output output) + public void Add(Microsoft.ML.Trainers.EnsembleClassification input, Microsoft.ML.Trainers.EnsembleClassification.Output output) { - _jsonNodes.Add(Serialize("Trainers.ClassificationEnsemble", input, output)); + _jsonNodes.Add(Serialize("Trainers.EnsembleClassification", input, output)); + } + + public Microsoft.ML.Trainers.EnsembleRegression.Output Add(Microsoft.ML.Trainers.EnsembleRegression input) + { + var output = new Microsoft.ML.Trainers.EnsembleRegression.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Trainers.EnsembleRegression input, Microsoft.ML.Trainers.EnsembleRegression.Output output) + { + _jsonNodes.Add(Serialize("Trainers.EnsembleRegression", input, output)); } public Microsoft.ML.Trainers.FastForestBinaryClassifier.Output Add(Microsoft.ML.Trainers.FastForestBinaryClassifier 
input) @@ -706,18 +718,6 @@ public void Add(Microsoft.ML.Trainers.PoissonRegressor input, Microsoft.ML.Train _jsonNodes.Add(Serialize("Trainers.PoissonRegressor", input, output)); } - public Microsoft.ML.Trainers.RegressionEnsemble.Output Add(Microsoft.ML.Trainers.RegressionEnsemble input) - { - var output = new Microsoft.ML.Trainers.RegressionEnsemble.Output(); - Add(input, output); - return output; - } - - public void Add(Microsoft.ML.Trainers.RegressionEnsemble input, Microsoft.ML.Trainers.RegressionEnsemble.Output output) - { - _jsonNodes.Add(Serialize("Trainers.RegressionEnsemble", input, output)); - } - public Microsoft.ML.Trainers.StochasticDualCoordinateAscentBinaryClassifier.Output Add(Microsoft.ML.Trainers.StochasticDualCoordinateAscentBinaryClassifier input) { var output = new Microsoft.ML.Trainers.StochasticDualCoordinateAscentBinaryClassifier.Output(); @@ -4177,7 +4177,7 @@ namespace Trainers /// /// Train binary ensemble. /// - public sealed partial class BinaryClassifierEnsemble : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class EnsembleBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { @@ -4261,18 +4261,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(BinaryClassifierEnsemble)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(EnsembleBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new BinaryClassifierEnsemblePipelineStep(output); + 
return new EnsembleBinaryClassifierPipelineStep(output); } - private class BinaryClassifierEnsemblePipelineStep : ILearningPipelinePredictorStep + private class EnsembleBinaryClassifierPipelineStep : ILearningPipelinePredictorStep { - public BinaryClassifierEnsemblePipelineStep(Output output) + public EnsembleBinaryClassifierPipelineStep(Output output) { Model = output.PredictorModel; } @@ -4288,7 +4288,7 @@ namespace Trainers /// /// Train multiclass ensemble. /// - public sealed partial class ClassificationEnsemble : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class EnsembleClassification : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { @@ -4372,18 +4372,129 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(ClassificationEnsemble)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(EnsembleClassification)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + TrainingData = dataStep.Data; + } + Output output = experiment.Add(this); + return new EnsembleClassificationPipelineStep(output); + } + + private class EnsembleClassificationPipelineStep : ILearningPipelinePredictorStep + { + public EnsembleClassificationPipelineStep(Output output) + { + Model = output.PredictorModel; + } + + public Var Model { get; } + } + } + } + + namespace Trainers + { + + /// + /// Train regression ensemble. 
+ /// + public sealed partial class EnsembleRegression : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + { + + + /// + /// Algorithm to prune the base learners for selective Ensemble + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleRegressionSubModelSelector SubModelSelectorType { get; set; } = new AllSelectorEnsembleRegressionSubModelSelector(); + + /// + /// Output combiner + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleRegressionOutputCombiner OutputCombiner { get; set; } = new MedianEnsembleRegressionOutputCombiner(); + + /// + /// Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise. + /// + public int? NumModels { get; set; } + + /// + /// Batch size + /// + public int BatchSize { get; set; } = -1; + + /// + /// Sampling Type + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleSubsetSelector SamplingType { get; set; } = new BootstrapSelectorEnsembleSubsetSelector(); + + /// + /// All the base learners will run asynchronously if the value is true + /// + public bool TrainParallel { get; set; } = false; + + /// + /// True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set + /// + public bool ShowMetrics { get; set; } = false; + + /// + /// Column to use for labels + /// + public string LabelColumn { get; set; } = "Label"; + + /// + /// The data to be used for training + /// + public Var TrainingData { get; set; } = new Var(); + + /// + /// Column to use for features + /// + public string FeatureColumn { get; set; } = "Features"; + + /// + /// Normalize option for the feature column + /// + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; + + /// + /// Whether learner should cache input training data + /// + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + public Var GetInputData() => TrainingData; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(EnsembleRegression)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new ClassificationEnsemblePipelineStep(output); + return new EnsembleRegressionPipelineStep(output); } - private class ClassificationEnsemblePipelineStep : ILearningPipelinePredictorStep + private class EnsembleRegressionPipelineStep : ILearningPipelinePredictorStep { - public ClassificationEnsemblePipelineStep(Output output) + public EnsembleRegressionPipelineStep(Output output) { Model = output.PredictorModel; } @@ -7889,117 +8000,6 @@ public 
PoissonRegressorPipelineStep(Output output) } } - namespace Trainers - { - - /// - /// Train regression ensemble. - /// - public sealed partial class RegressionEnsemble : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem - { - - - /// - /// Algorithm to prune the base learners for selective Ensemble - /// - [JsonConverter(typeof(ComponentSerializer))] - public EnsembleRegressionSubModelSelector SubModelSelectorType { get; set; } = new AllSelectorEnsembleRegressionSubModelSelector(); - - /// - /// Output combiner - /// - [JsonConverter(typeof(ComponentSerializer))] - public EnsembleRegressionOutputCombiner OutputCombiner { get; set; } = new MedianEnsembleRegressionOutputCombiner(); - - /// - /// Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise. - /// - public int? NumModels { get; set; } - - /// - /// Batch size - /// - public int BatchSize { get; set; } = -1; - - /// - /// Sampling Type - /// - [JsonConverter(typeof(ComponentSerializer))] - public EnsembleSubsetSelector SamplingType { get; set; } = new BootstrapSelectorEnsembleSubsetSelector(); - - /// - /// All the base learners will run asynchronously if the value is true - /// - public bool TrainParallel { get; set; } = false; - - /// - /// True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set - /// - public bool ShowMetrics { get; set; } = false; - - /// - /// Column to use for labels - /// - public string LabelColumn { get; set; } = "Label"; - - /// - /// The data to be used for training - /// - public Var TrainingData { get; set; } = new Var(); - - /// - /// Column to use for features - /// - public string FeatureColumn { get; set; } = "Features"; - - /// - /// Normalize option for the feature column - /// - public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; - - /// - /// Whether learner should cache input training data - /// - public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - - - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput - { - /// - /// The trained model - /// - public Var PredictorModel { get; set; } = new Var(); - - } - public Var GetInputData() => TrainingData; - - public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) - { - if (previousStep != null) - { - if (!(previousStep is ILearningPipelineDataStep dataStep)) - { - throw new InvalidOperationException($"{ nameof(RegressionEnsemble)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); - } - - TrainingData = dataStep.Data; - } - Output output = experiment.Add(this); - return new RegressionEnsemblePipelineStep(output); - } - - private class RegressionEnsemblePipelineStep : ILearningPipelinePredictorStep - { - public RegressionEnsemblePipelineStep(Output output) - { - Model = output.PredictorModel; - } - - public Var Model { get; } - } - } - } - namespace Trainers { diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index 9b86202085..632539a526 100644 --- 
a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -37,8 +37,9 @@ Models.SweepResultExtractor Extracts the sweep result. Microsoft.ML.Runtime.Entr Models.TrainTestBinaryEvaluator Train test for binary classification Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro TrainTestBinary Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.TrainTestBinaryMacro+Output] Models.TrainTestEvaluator General train test for any supported evaluator Microsoft.ML.Runtime.EntryPoints.TrainTestMacro TrainTest Microsoft.ML.Runtime.EntryPoints.TrainTestMacro+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MacroOutput`1[Microsoft.ML.Runtime.EntryPoints.TrainTestMacro+Output] Trainers.AveragedPerceptronBinaryClassifier Train a Average perceptron. Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer TrainBinary Microsoft.ML.Runtime.Learners.AveragedPerceptronTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput -Trainers.BinaryClassifierEnsemble Train binary ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateBinaryEnsemble Microsoft.ML.Runtime.Ensemble.EnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput -Trainers.ClassificationEnsemble Train multiclass ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateMultiClassEnsemble Microsoft.ML.Runtime.Ensemble.MulticlassDataPartitionEnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput +Trainers.EnsembleBinaryClassifier Train binary ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateBinaryEnsemble Microsoft.ML.Runtime.Ensemble.EnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput +Trainers.EnsembleClassification Train multiclass ensemble. 
Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateMultiClassEnsemble Microsoft.ML.Runtime.Ensemble.MulticlassDataPartitionEnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput +Trainers.EnsembleRegression Train regression ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateRegressionEnsemble Microsoft.ML.Runtime.Ensemble.RegressionEnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.FastForestBinaryClassifier Uses a random forest learner to perform binary classification. Microsoft.ML.Runtime.FastTree.FastForest TrainBinary Microsoft.ML.Runtime.FastTree.FastForestClassification+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput Trainers.FastForestRegressor Trains a random forest to fit target values using least-squares. Microsoft.ML.Runtime.FastTree.FastForest TrainRegression Microsoft.ML.Runtime.FastTree.FastForestRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.FastTreeBinaryClassifier Uses a logit-boost boosted tree learner to perform binary classification. Microsoft.ML.Runtime.FastTree.FastTree TrainBinary Microsoft.ML.Runtime.FastTree.FastTreeBinaryClassificationTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput @@ -55,7 +56,6 @@ Trainers.NaiveBayesClassifier Train a MultiClassNaiveBayesTrainer. Microsoft.ML. Trainers.OnlineGradientDescentRegressor Train a Online gradient descent perceptron. Microsoft.ML.Runtime.Learners.OnlineGradientDescentTrainer TrainRegression Microsoft.ML.Runtime.Learners.OnlineGradientDescentTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.PcaAnomalyDetector Train an PCA Anomaly model. 
Microsoft.ML.Runtime.PCA.RandomizedPcaTrainer TrainPcaAnomaly Microsoft.ML.Runtime.PCA.RandomizedPcaTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+AnomalyDetectionOutput Trainers.PoissonRegressor Train an Poisson regression model. Microsoft.ML.Runtime.Learners.PoissonRegression TrainRegression Microsoft.ML.Runtime.Learners.PoissonRegression+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput -Trainers.RegressionEnsemble Train regression ensemble. Microsoft.ML.Ensemble.EntryPoints.Ensemble CreateRegressionEnsemble Microsoft.ML.Runtime.Ensemble.RegressionEnsembleTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput Trainers.StochasticDualCoordinateAscentBinaryClassifier Train an SDCA binary model. Microsoft.ML.Runtime.Learners.Sdca TrainBinary Microsoft.ML.Runtime.Learners.LinearClassificationTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+BinaryClassificationOutput Trainers.StochasticDualCoordinateAscentClassifier Train an SDCA multi class model Microsoft.ML.Runtime.Learners.Sdca TrainMultiClass Microsoft.ML.Runtime.Learners.SdcaMultiClassTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+MulticlassClassificationOutput Trainers.StochasticDualCoordinateAscentRegressor Train an SDCA regression model Microsoft.ML.Runtime.Learners.Sdca TrainRegression Microsoft.ML.Runtime.Learners.SdcaRegressionTrainer+Arguments Microsoft.ML.Runtime.EntryPoints.CommonOutputs+RegressionOutput diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index 1874a35d3a..bf3f9a28bc 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -4065,7 +4065,7 @@ ] }, { - "Name": "Trainers.BinaryClassifierEnsemble", + "Name": "Trainers.EnsembleBinaryClassifier", "Desc": "Train binary ensemble.", "FriendlyName": "Parallel Ensemble (bagging, stacking, 
etc)", "ShortName": null, @@ -4267,7 +4267,7 @@ ] }, { - "Name": "Trainers.ClassificationEnsemble", + "Name": "Trainers.EnsembleClassification", "Desc": "Train multiclass ensemble.", "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", "ShortName": null, @@ -4468,6 +4468,208 @@ "ITrainerOutput" ] }, + { + "Name": "Trainers.EnsembleRegression", + "Desc": "Train regression ensemble.", + "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", + "ShortName": null, + "Inputs": [ + { + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "SamplingType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleSubsetSelector" + }, + "Desc": "Sampling Type", + "Aliases": [ + "st" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": { + "Name": "BootstrapSelector", + "Settings": { + "FeatureSelector": { + "Name": "AllFeatureSelector" + } + } + } + }, + { + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", + "Aliases": [ + "feat" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Features" + }, + { + "Name": "NumModels", + "Type": "Int", + "Desc": "Number of models per batch. 
If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise.", + "Aliases": [ + "nm" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", + "Aliases": [ + "lab" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Label" + }, + { + "Name": "SubModelSelectorType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleRegressionSubModelSelector" + }, + "Desc": "Algorithm to prune the base learners for selective Ensemble", + "Aliases": [ + "pt" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": { + "Name": "AllSelector" + } + }, + { + "Name": "OutputCombiner", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleRegressionOutputCombiner" + }, + "Desc": "Output combiner", + "Aliases": [ + "oc" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": { + "Name": "Median" + } + }, + { + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": "Auto" + }, + { + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", + "Aliases": [ + "cache" + ], + "Required": false, + "SortOrder": 6.0, + "IsNullable": false, + "Default": "Auto" + }, + { + "Name": "TrainParallel", + "Type": "Bool", + "Desc": "All the base learners will run asynchronously if the value is true", + "Aliases": [ + "tp" + ], + "Required": false, + "SortOrder": 106.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "BatchSize", + "Type": "Int", + "Desc": "Batch size", + "Aliases": [ + 
"bs" + ], + "Required": false, + "SortOrder": 107.0, + "IsNullable": false, + "Default": -1 + }, + { + "Name": "ShowMetrics", + "Type": "Bool", + "Desc": "True, if metrics for each model need to be evaluated and shown in comparison table. This is done by using validation set if available or the training set", + "Aliases": [ + "sm" + ], + "Required": false, + "SortOrder": 108.0, + "IsNullable": false, + "Default": false + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IRegressionOutput", + "ITrainerOutput" + ] + }, { "Name": "Trainers.FastForestBinaryClassifier", "Desc": "Uses a random forest learner to perform binary classification.", @@ -12012,208 +12214,6 @@ "ITrainerOutput" ] }, - { - "Name": "Trainers.RegressionEnsemble", - "Desc": "Train regression ensemble.", - "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", - "ShortName": null, - "Inputs": [ - { - "Name": "TrainingData", - "Type": "DataView", - "Desc": "The data to be used for training", - "Aliases": [ - "data" - ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "SamplingType", - "Type": { - "Kind": "Component", - "ComponentKind": "EnsembleSubsetSelector" - }, - "Desc": "Sampling Type", - "Aliases": [ - "st" - ], - "Required": false, - "SortOrder": 2.0, - "IsNullable": false, - "Default": { - "Name": "BootstrapSelector", - "Settings": { - "FeatureSelector": { - "Name": "AllFeatureSelector" - } - } - } - }, - { - "Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", - "Aliases": [ - "feat" - ], - "Required": false, - "SortOrder": 2.0, - "IsNullable": false, - "Default": "Features" - }, - { - "Name": "NumModels", - "Type": "Int", - "Desc": "Number of models per batch. 
If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise.", - "Aliases": [ - "nm" - ], - "Required": false, - "SortOrder": 3.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Column to use for labels", - "Aliases": [ - "lab" - ], - "Required": false, - "SortOrder": 3.0, - "IsNullable": false, - "Default": "Label" - }, - { - "Name": "SubModelSelectorType", - "Type": { - "Kind": "Component", - "ComponentKind": "EnsembleRegressionSubModelSelector" - }, - "Desc": "Algorithm to prune the base learners for selective Ensemble", - "Aliases": [ - "pt" - ], - "Required": false, - "SortOrder": 4.0, - "IsNullable": false, - "Default": { - "Name": "AllSelector" - } - }, - { - "Name": "OutputCombiner", - "Type": { - "Kind": "Component", - "ComponentKind": "EnsembleRegressionOutputCombiner" - }, - "Desc": "Output combiner", - "Aliases": [ - "oc" - ], - "Required": false, - "SortOrder": 5.0, - "IsNullable": false, - "Default": { - "Name": "Median" - } - }, - { - "Name": "NormalizeFeatures", - "Type": { - "Kind": "Enum", - "Values": [ - "No", - "Warn", - "Auto", - "Yes" - ] - }, - "Desc": "Normalize option for the feature column", - "Aliases": [ - "norm" - ], - "Required": false, - "SortOrder": 5.0, - "IsNullable": false, - "Default": "Auto" - }, - { - "Name": "Caching", - "Type": { - "Kind": "Enum", - "Values": [ - "Auto", - "Memory", - "Disk", - "None" - ] - }, - "Desc": "Whether learner should cache input training data", - "Aliases": [ - "cache" - ], - "Required": false, - "SortOrder": 6.0, - "IsNullable": false, - "Default": "Auto" - }, - { - "Name": "TrainParallel", - "Type": "Bool", - "Desc": "All the base learners will run asynchronously if the value is true", - "Aliases": [ - "tp" - ], - "Required": false, - "SortOrder": 106.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "BatchSize", - "Type": "Int", - "Desc": "Batch size", - "Aliases": [ - 
"bs" - ], - "Required": false, - "SortOrder": 107.0, - "IsNullable": false, - "Default": -1 - }, - { - "Name": "ShowMetrics", - "Type": "Bool", - "Desc": "True, if metrics for each model need to be evaluated and shown in comparison table. This is done by using validation set if available or the training set", - "Aliases": [ - "sm" - ], - "Required": false, - "SortOrder": 108.0, - "IsNullable": false, - "Default": false - } - ], - "Outputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The trained model" - } - ], - "InputKind": [ - "ITrainerInputWithLabel", - "ITrainerInput" - ], - "OutputKind": [ - "IRegressionOutput", - "ITrainerOutput" - ] - }, { "Name": "Trainers.StochasticDualCoordinateAscentBinaryClassifier", "Desc": "Train an SDCA binary model.", From c54f2e41630df7f70e6b082afa1a2ecaf3377a26 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Tue, 26 Jun 2018 12:26:23 -0700 Subject: [PATCH 16/20] update csharpapi and manifest.json --- src/Microsoft.ML/CSharpApi.cs | 8220 ++++++---- .../Common/EntryPoints/core_manifest.json | 13574 +++++++++------- 2 files changed, 12296 insertions(+), 9498 deletions(-) diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index 0f8fefb267..da03694c79 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -106,6 +106,18 @@ public void Add(Microsoft.ML.Models.AnomalyDetectionEvaluator input, Microsoft.M _jsonNodes.Add(Serialize("Models.AnomalyDetectionEvaluator", input, output)); } + public Microsoft.ML.Models.AnomalyPipelineEnsemble.Output Add(Microsoft.ML.Models.AnomalyPipelineEnsemble input) + { + var output = new Microsoft.ML.Models.AnomalyPipelineEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Models.AnomalyPipelineEnsemble input, Microsoft.ML.Models.AnomalyPipelineEnsemble.Output output) + { + _jsonNodes.Add(Serialize("Models.AnomalyPipelineEnsemble", input, output)); + } + public 
Microsoft.ML.Models.BinaryClassificationEvaluator.Output Add(Microsoft.ML.Models.BinaryClassificationEvaluator input) { var output = new Microsoft.ML.Models.BinaryClassificationEvaluator.Output(); @@ -130,6 +142,30 @@ public void Add(Microsoft.ML.Models.BinaryCrossValidator input, Microsoft.ML.Mod _jsonNodes.Add(Serialize("Models.BinaryCrossValidator", input, output)); } + public Microsoft.ML.Models.BinaryEnsemble.Output Add(Microsoft.ML.Models.BinaryEnsemble input) + { + var output = new Microsoft.ML.Models.BinaryEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Models.BinaryEnsemble input, Microsoft.ML.Models.BinaryEnsemble.Output output) + { + _jsonNodes.Add(Serialize("Models.BinaryEnsemble", input, output)); + } + + public Microsoft.ML.Models.BinaryPipelineEnsemble.Output Add(Microsoft.ML.Models.BinaryPipelineEnsemble input) + { + var output = new Microsoft.ML.Models.BinaryPipelineEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Models.BinaryPipelineEnsemble input, Microsoft.ML.Models.BinaryPipelineEnsemble.Output output) + { + _jsonNodes.Add(Serialize("Models.BinaryPipelineEnsemble", input, output)); + } + public Microsoft.ML.Models.ClassificationEvaluator.Output Add(Microsoft.ML.Models.ClassificationEvaluator input) { var output = new Microsoft.ML.Models.ClassificationEvaluator.Output(); @@ -202,6 +238,18 @@ public void Add(Microsoft.ML.Models.DatasetTransformer input, Microsoft.ML.Model _jsonNodes.Add(Serialize("Models.DatasetTransformer", input, output)); } + public Microsoft.ML.Models.EnsembleSummary.Output Add(Microsoft.ML.Models.EnsembleSummary input) + { + var output = new Microsoft.ML.Models.EnsembleSummary.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Models.EnsembleSummary input, Microsoft.ML.Models.EnsembleSummary.Output output) + { + _jsonNodes.Add(Serialize("Models.EnsembleSummary", input, output)); + } + public 
Microsoft.ML.Models.FixedPlattCalibrator.Output Add(Microsoft.ML.Models.FixedPlattCalibrator input) { var output = new Microsoft.ML.Models.FixedPlattCalibrator.Output(); @@ -214,6 +262,18 @@ public void Add(Microsoft.ML.Models.FixedPlattCalibrator input, Microsoft.ML.Mod _jsonNodes.Add(Serialize("Models.FixedPlattCalibrator", input, output)); } + public Microsoft.ML.Models.MultiClassPipelineEnsemble.Output Add(Microsoft.ML.Models.MultiClassPipelineEnsemble input) + { + var output = new Microsoft.ML.Models.MultiClassPipelineEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Models.MultiClassPipelineEnsemble input, Microsoft.ML.Models.MultiClassPipelineEnsemble.Output output) + { + _jsonNodes.Add(Serialize("Models.MultiClassPipelineEnsemble", input, output)); + } + public Microsoft.ML.Models.MultiOutputRegressionEvaluator.Output Add(Microsoft.ML.Models.MultiOutputRegressionEvaluator input) { var output = new Microsoft.ML.Models.MultiOutputRegressionEvaluator.Output(); @@ -334,6 +394,18 @@ public void Add(Microsoft.ML.Models.RankerEvaluator input, Microsoft.ML.Models.R _jsonNodes.Add(Serialize("Models.RankerEvaluator", input, output)); } + public Microsoft.ML.Models.RegressionEnsemble.Output Add(Microsoft.ML.Models.RegressionEnsemble input) + { + var output = new Microsoft.ML.Models.RegressionEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Models.RegressionEnsemble input, Microsoft.ML.Models.RegressionEnsemble.Output output) + { + _jsonNodes.Add(Serialize("Models.RegressionEnsemble", input, output)); + } + public Microsoft.ML.Models.RegressionEvaluator.Output Add(Microsoft.ML.Models.RegressionEvaluator input) { var output = new Microsoft.ML.Models.RegressionEvaluator.Output(); @@ -346,6 +418,18 @@ public void Add(Microsoft.ML.Models.RegressionEvaluator input, Microsoft.ML.Mode _jsonNodes.Add(Serialize("Models.RegressionEvaluator", input, output)); } + public 
Microsoft.ML.Models.RegressionPipelineEnsemble.Output Add(Microsoft.ML.Models.RegressionPipelineEnsemble input) + { + var output = new Microsoft.ML.Models.RegressionPipelineEnsemble.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Models.RegressionPipelineEnsemble input, Microsoft.ML.Models.RegressionPipelineEnsemble.Output output) + { + _jsonNodes.Add(Serialize("Models.RegressionPipelineEnsemble", input, output)); + } + public Microsoft.ML.Models.Summarizer.Output Add(Microsoft.ML.Models.Summarizer input) { var output = new Microsoft.ML.Models.Summarizer.Output(); @@ -406,6 +490,42 @@ public void Add(Microsoft.ML.Trainers.AveragedPerceptronBinaryClassifier input, _jsonNodes.Add(Serialize("Trainers.AveragedPerceptronBinaryClassifier", input, output)); } + public Microsoft.ML.Trainers.EnsembleBinaryClassifier.Output Add(Microsoft.ML.Trainers.EnsembleBinaryClassifier input) + { + var output = new Microsoft.ML.Trainers.EnsembleBinaryClassifier.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Trainers.EnsembleBinaryClassifier input, Microsoft.ML.Trainers.EnsembleBinaryClassifier.Output output) + { + _jsonNodes.Add(Serialize("Trainers.EnsembleBinaryClassifier", input, output)); + } + + public Microsoft.ML.Trainers.EnsembleClassification.Output Add(Microsoft.ML.Trainers.EnsembleClassification input) + { + var output = new Microsoft.ML.Trainers.EnsembleClassification.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Trainers.EnsembleClassification input, Microsoft.ML.Trainers.EnsembleClassification.Output output) + { + _jsonNodes.Add(Serialize("Trainers.EnsembleClassification", input, output)); + } + + public Microsoft.ML.Trainers.EnsembleRegression.Output Add(Microsoft.ML.Trainers.EnsembleRegression input) + { + var output = new Microsoft.ML.Trainers.EnsembleRegression.Output(); + Add(input, output); + return output; + } + + public void 
Add(Microsoft.ML.Trainers.EnsembleRegression input, Microsoft.ML.Trainers.EnsembleRegression.Output output) + { + _jsonNodes.Add(Serialize("Trainers.EnsembleRegression", input, output)); + } + public Microsoft.ML.Trainers.FastForestBinaryClassifier.Output Add(Microsoft.ML.Trainers.FastForestBinaryClassifier input) { var output = new Microsoft.ML.Trainers.FastForestBinaryClassifier.Output(); @@ -1808,6 +1928,44 @@ public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IEva } } + namespace Models + { + public enum EnsembleCreatorScoreCombiner + { + Median = 0, + Average = 1 + } + + + /// + /// Combine anomaly detection models into an ensemble + /// + public sealed partial class AnomalyPipelineEnsemble + { + + + /// + /// The combiner used to combine the scores + /// + public EnsembleCreatorScoreCombiner ModelCombiner { get; set; } = EnsembleCreatorScoreCombiner.Average; + + /// + /// The models to combine into an ensemble + /// + public ArrayVar Models { get; set; } = new ArrayVar(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IAnomalyDetectionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + } + } + namespace Models { @@ -1995,6 +2153,82 @@ public sealed class Output } } + namespace Models + { + public enum EnsembleCreatorClassifierCombiner + { + Median = 0, + Average = 1, + Vote = 2 + } + + + /// + /// Combine binary classifiers into an ensemble + /// + public sealed partial class BinaryEnsemble + { + + + /// + /// The combiner used to combine the scores + /// + public EnsembleCreatorClassifierCombiner ModelCombiner { get; set; } = EnsembleCreatorClassifierCombiner.Median; + + /// + /// The models to combine into an ensemble + /// + public ArrayVar Models { get; set; } = new ArrayVar(); + + /// + /// Whether to validate that all the pipelines are identical + /// + public bool 
ValidatePipelines { get; set; } = true; + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + } + } + + namespace Models + { + + /// + /// Combine binary classification models into an ensemble + /// + public sealed partial class BinaryPipelineEnsemble + { + + + /// + /// The combiner used to combine the scores + /// + public EnsembleCreatorClassifierCombiner ModelCombiner { get; set; } = EnsembleCreatorClassifierCombiner.Median; + + /// + /// The models to combine into an ensemble + /// + public ArrayVar Models { get; set; } = new ArrayVar(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + } + } + namespace Models { @@ -2473,6 +2707,38 @@ public DatasetTransformerPipelineStep(Output output) } } + namespace Models + { + + /// + /// Summarize a pipeline ensemble predictor. 
+ /// + public sealed partial class EnsembleSummary + { + + + /// + /// The predictor to summarize + /// + public Var PredictorModel { get; set; } = new Var(); + + + public sealed class Output + { + /// + /// The summaries of the individual predictors + /// + public ArrayVar Summaries { get; set; } = new ArrayVar(); + + /// + /// The model statistics of the individual predictors + /// + public ArrayVar Stats { get; set; } = new ArrayVar(); + + } + } + } + namespace Models { @@ -2547,6 +2813,38 @@ public FixedPlattCalibratorPipelineStep(Output output) } } + namespace Models + { + + /// + /// Combine multiclass classifiers into an ensemble + /// + public sealed partial class MultiClassPipelineEnsemble + { + + + /// + /// The combiner used to combine the scores + /// + public EnsembleCreatorClassifierCombiner ModelCombiner { get; set; } = EnsembleCreatorClassifierCombiner.Median; + + /// + /// The models to combine into an ensemble + /// + public ArrayVar Models { get; set; } = new ArrayVar(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IMulticlassClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + } + } + namespace Models { @@ -3283,6 +3581,43 @@ public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IEva } } + namespace Models + { + + /// + /// Combine regression models into an ensemble + /// + public sealed partial class RegressionEnsemble + { + + + /// + /// The combiner used to combine the scores + /// + public EnsembleCreatorScoreCombiner ModelCombiner { get; set; } = EnsembleCreatorScoreCombiner.Median; + + /// + /// The models to combine into an ensemble + /// + public ArrayVar Models { get; set; } = new ArrayVar(); + + /// + /// Whether to validate that all the pipelines are identical + /// + public bool ValidatePipelines { get; set; } = true; + + + public sealed class 
Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + } + } + namespace Models { @@ -3355,20 +3690,52 @@ namespace Models { /// - /// Summarize a linear regression predictor. + /// Combine regression models into an ensemble /// - public sealed partial class Summarizer + public sealed partial class RegressionPipelineEnsemble { /// - /// The predictor to summarize + /// The combiner used to combine the scores /// - public Var PredictorModel { get; set; } = new Var(); + public EnsembleCreatorScoreCombiner ModelCombiner { get; set; } = EnsembleCreatorScoreCombiner.Median; + /// + /// The models to combine into an ensemble + /// + public ArrayVar Models { get; set; } = new ArrayVar(); - public sealed class Output - { + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + } + } + + namespace Models + { + + /// + /// Summarize a linear regression predictor. + /// + public sealed partial class Summarizer + { + + + /// + /// The predictor to summarize + /// + public Var PredictorModel { get; set; } = new Var(); + + + public sealed class Output + { /// /// The summary of a predictor /// @@ -3818,240 +4185,273 @@ public AveragedPerceptronBinaryClassifierPipelineStep(Output output) namespace Trainers { - public enum Bundle : byte - { - None = 0, - AggregateLowPopulation = 1, - Adjacent = 2 - } - /// - /// Uses a random forest learner to perform binary classification. + /// Train binary ensemble. 
/// - public sealed partial class FastForestBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class EnsembleBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Upper bound on absolute value of single tree output + /// Algorithm to prune the base learners for selective Ensemble /// - public double MaxTreeOutput { get; set; } = 100d; + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleBinarySubModelSelector SubModelSelectorType { get; set; } = new AllSelectorEnsembleBinarySubModelSelector(); /// - /// The calibrator kind to apply to the predictor. Specify null for no calibration + /// Output combiner /// [JsonConverter(typeof(ComponentSerializer))] - public CalibratorTrainer Calibrator { get; set; } = new PlattCalibratorCalibratorTrainer(); + public EnsembleBinaryOutputCombiner OutputCombiner { get; set; } = new MedianEnsembleBinaryOutputCombiner(); /// - /// The maximum number of examples to use when training the calibrator + /// Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise. /// - public int MaxCalibrationExamples { get; set; } = 1000000; + public int? 
NumModels { get; set; } /// - /// Number of labels to be sampled from each leaf to make the distribtuion + /// Batch size /// - public int QuantileSampleCount { get; set; } = 100; + public int BatchSize { get; set; } = -1; /// - /// Allows to choose Parallel FastTree Learning Algorithm + /// Sampling Type /// [JsonConverter(typeof(ComponentSerializer))] - public ParallelTraining ParallelTrainer { get; set; } = new SingleParallelTraining(); + public EnsembleSubsetSelector SamplingType { get; set; } = new BootstrapSelectorEnsembleSubsetSelector(); /// - /// The number of threads to use + /// All the base learners will run asynchronously if the value is true /// - public int? NumThreads { get; set; } + public bool TrainParallel { get; set; } = false; /// - /// The seed of the random number generator + /// True, if metrics for each model need to be evaluated and shown in comparison table. This is done by using validation set if available or the training set /// - public int RngSeed { get; set; } = 123; + public bool ShowMetrics { get; set; } = false; /// - /// The seed of the active feature selection + /// Column to use for labels /// - public int FeatureSelectSeed { get; set; } = 123; + public string LabelColumn { get; set; } = "Label"; /// - /// The entropy (regularization) coefficient between 0 and 1 + /// The data to be used for training /// - public double EntropyCoefficient { get; set; } + public Var TrainingData { get; set; } = new Var(); /// - /// The number of histograms in the pool (between 2 and numLeaves) + /// Column to use for features /// - public int HistogramPoolSize { get; set; } = -1; + public string FeatureColumn { get; set; } = "Features"; /// - /// Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose + /// Normalize option for the feature column /// - public bool? 
DiskTranspose { get; set; } + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// - /// Whether to collectivize features during dataset preparation to speed up training + /// Whether learner should cache input training data /// - public bool FeatureFlocks { get; set; } = true; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - /// - /// Whether to do split based on multiple categorical feature values. - /// - public bool CategoricalSplit { get; set; } = false; - /// - /// Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features. - /// - public int MaxCategoricalGroupsPerNode { get; set; } = 64; + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); - /// - /// Maximum categorical split points to consider when splitting on a categorical feature. - /// - public int MaxCategoricalSplitPoints { get; set; } = 64; + } + public Var GetInputData() => TrainingData; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(EnsembleBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } - /// - /// Minimum categorical docs percentage in a bin to consider for a split. 
- /// - public double MinDocsPercentageForCategoricalSplit { get; set; } = 0.001d; + TrainingData = dataStep.Data; + } + Output output = experiment.Add(this); + return new EnsembleBinaryClassifierPipelineStep(output); + } - /// - /// Minimum categorical doc count in a bin to consider for a split. - /// - public int MinDocsForCategoricalSplit { get; set; } = 100; + private class EnsembleBinaryClassifierPipelineStep : ILearningPipelinePredictorStep + { + public EnsembleBinaryClassifierPipelineStep(Output output) + { + Model = output.PredictorModel; + } - /// - /// Bias for calculating gradient for each feature bin for a categorical feature. - /// - public double Bias { get; set; } + public Var Model { get; } + } + } + } - /// - /// Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. - /// - public Bundle Bundling { get; set; } = Bundle.None; + namespace Trainers + { - /// - /// Maximum number of distinct values (bins) per feature - /// - public int MaxBins { get; set; } = 255; + /// + /// Train multiclass ensemble. 
+ /// + public sealed partial class EnsembleClassification : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + { - /// - /// Sparsity level needed to use sparse feature representation - /// - public double SparsifyThreshold { get; set; } = 0.7d; /// - /// The feature first use penalty coefficient + /// Algorithm to prune the base learners for selective Ensemble /// - public double FeatureFirstUsePenalty { get; set; } + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleMulticlassSubModelSelector SubModelSelectorType { get; set; } = new AllSelectorMultiClassEnsembleMulticlassSubModelSelector(); /// - /// The feature re-use penalty (regularization) coefficient + /// Output combiner /// - public double FeatureReusePenalty { get; set; } + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleMulticlassOutputCombiner OutputCombiner { get; set; } = new MultiMedianEnsembleMulticlassOutputCombiner(); /// - /// Tree fitting gain confidence requirement (should be in the range [0,1) ). + /// Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise. /// - public double GainConfidenceLevel { get; set; } + public int? 
NumModels { get; set; } /// - /// The temperature of the randomized softmax distribution for choosing the feature + /// Batch size /// - public double SoftmaxTemperature { get; set; } + public int BatchSize { get; set; } = -1; /// - /// Print execution time breakdown to stdout + /// Sampling Type /// - public bool ExecutionTimes { get; set; } = false; + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleSubsetSelector SamplingType { get; set; } = new BootstrapSelectorEnsembleSubsetSelector(); /// - /// The max number of leaves in each regression tree + /// All the base learners will run asynchronously if the value is true /// - [TlcModule.SweepableLongParamAttribute("NumLeaves", 2, 128, stepSize:4, isLogScale:true)] - public int NumLeaves { get; set; } = 20; + public bool TrainParallel { get; set; } = false; /// - /// The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data + /// True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set /// - [TlcModule.SweepableDiscreteParamAttribute("MinDocumentsInLeafs", new object[]{1, 10, 50})] - public int MinDocumentsInLeafs { get; set; } = 10; + public bool ShowMetrics { get; set; } = false; /// - /// Total number of decision trees to create in the ensemble + /// Column to use for labels /// - [TlcModule.SweepableDiscreteParamAttribute("NumTrees", new object[]{20, 100, 500})] - public int NumTrees { get; set; } = 100; + public string LabelColumn { get; set; } = "Label"; /// - /// The fraction of features (chosen randomly) to use on each iteration + /// The data to be used for training /// - public double FeatureFraction { get; set; } = 0.7d; + public Var TrainingData { get; set; } = new Var(); /// - /// Number of trees in each bag (0 for disabling bagging) + /// Column to use for features /// - public int BaggingSize { get; set; } = 1; + public string FeatureColumn { get; set; } = "Features"; /// - /// Percentage of training examples used in each bag + /// Normalize option for the feature column /// - public double BaggingTrainFraction { get; set; } = 0.7d; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// - /// The fraction of features (chosen randomly) to use on each split + /// Whether learner should cache input training data /// - public double SplitFraction { get; set; } = 0.7d; + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - /// - /// Smoothing paramter for tree regularization - /// - public double Smoothing { get; set; } - /// - /// When a root split is impossible, allow training to proceed - /// - public bool AllowEmptyTrees { get; set; } = true; + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IMulticlassClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained 
model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + public Var GetInputData() => TrainingData; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(EnsembleClassification)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + TrainingData = dataStep.Data; + } + Output output = experiment.Add(this); + return new EnsembleClassificationPipelineStep(output); + } + + private class EnsembleClassificationPipelineStep : ILearningPipelinePredictorStep + { + public EnsembleClassificationPipelineStep(Output output) + { + Model = output.PredictorModel; + } + + public Var Model { get; } + } + } + } + + namespace Trainers + { + + /// + /// Train regression ensemble. + /// + public sealed partial class EnsembleRegression : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + { - /// - /// The level of feature compression to use - /// - public int FeatureCompressionLevel { get; set; } = 1; /// - /// Compress the tree Ensemble + /// Algorithm to prune the base learners for selective Ensemble /// - public bool CompressEnsemble { get; set; } = false; + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleRegressionSubModelSelector SubModelSelectorType { get; set; } = new AllSelectorEnsembleRegressionSubModelSelector(); /// - /// Maximum Number of trees after compression + /// Output combiner /// - public int MaxTreesAfterCompression { get; set; } = -1; + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleRegressionOutputCombiner OutputCombiner { get; set; } = new MedianEnsembleRegressionOutputCombiner(); /// - /// Print metrics graph for the first test set + /// Number of models per batch. 
If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise. /// - public bool PrintTestGraph { get; set; } = false; + public int? NumModels { get; set; } /// - /// Print Train and Validation metrics in graph + /// Batch size /// - public bool PrintTrainValidGraph { get; set; } = false; + public int BatchSize { get; set; } = -1; /// - /// Calculate metric values for train/valid/test every k rounds + /// Sampling Type /// - public int TestFrequency { get; set; } = 2147483647; + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleSubsetSelector SamplingType { get; set; } = new BootstrapSelectorEnsembleSubsetSelector(); /// - /// Column to use for example groupId + /// All the base learners will run asynchronously if the value is true /// - public Microsoft.ML.Runtime.EntryPoints.Optional GroupIdColumn { get; set; } + public bool TrainParallel { get; set; } = false; /// - /// Column to use for example weight + /// True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set /// - public Microsoft.ML.Runtime.EntryPoints.Optional WeightColumn { get; set; } + public bool ShowMetrics { get; set; } = false; /// /// Column to use for labels @@ -4079,7 +4479,7 @@ public sealed partial class FastForestBinaryClassifier : Microsoft.ML.Runtime.En public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -4095,18 +4495,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(FastForestBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(EnsembleRegression)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new FastForestBinaryClassifierPipelineStep(output); + return new EnsembleRegressionPipelineStep(output); } - private class FastForestBinaryClassifierPipelineStep : ILearningPipelinePredictorStep + private class EnsembleRegressionPipelineStep : ILearningPipelinePredictorStep { - public FastForestBinaryClassifierPipelineStep(Output output) + public EnsembleRegressionPipelineStep(Output output) { Model = output.PredictorModel; } @@ -4118,18 +4518,36 @@ public FastForestBinaryClassifierPipelineStep(Output output) namespace Trainers { + public enum Bundle : byte + { + None = 0, + AggregateLowPopulation = 1, + Adjacent = 2 + } + /// - /// Trains a random 
forest to fit target values using least-squares. + /// Uses a random forest learner to perform binary classification. /// - public sealed partial class FastForestRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class FastForestBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Shuffle the labels on every iteration. Useful probably only if using this tree as a tree leaf featurizer for multiclass. + /// Upper bound on absolute value of single tree output /// - public bool ShuffleLabels { get; set; } = false; + public double MaxTreeOutput { get; set; } = 100d; + + /// + /// The calibrator kind to apply to the predictor. 
Specify null for no calibration + /// + [JsonConverter(typeof(ComponentSerializer))] + public CalibratorTrainer Calibrator { get; set; } = new PlattCalibratorCalibratorTrainer(); + + /// + /// The maximum number of examples to use when training the calibrator + /// + public int MaxCalibrationExamples { get; set; } = 1000000; /// /// Number of labels to be sampled from each leaf to make the distribtuion @@ -4361,7 +4779,7 @@ public sealed partial class FastForestRegressor : Microsoft.ML.Runtime.EntryPoin public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -4377,18 +4795,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(FastForestRegressor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(FastForestBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new FastForestRegressorPipelineStep(output); + return new FastForestBinaryClassifierPipelineStep(output); } - private class FastForestRegressorPipelineStep : ILearningPipelinePredictorStep + private class FastForestBinaryClassifierPipelineStep : ILearningPipelinePredictorStep { - public FastForestRegressorPipelineStep(Output output) + public FastForestBinaryClassifierPipelineStep(Output output) { Model = output.PredictorModel; } @@ -4400,139 +4818,23 @@ public 
FastForestRegressorPipelineStep(Output output) namespace Trainers { - public enum BoostedTreeArgsOptimizationAlgorithmType - { - GradientDescent = 0, - AcceleratedGradientDescent = 1, - ConjugateGradientDescent = 2 - } - /// - /// Uses a logit-boost boosted tree learner to perform binary classification. + /// Trains a random forest to fit target values using least-squares. /// - public sealed partial class FastTreeBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class FastForestRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Should we use derivatives optimized for unbalanced sets - /// - public bool UnbalancedSets { get; set; } = false; - - /// - /// Use best regression step trees? - /// - public bool BestStepRankingRegressionTrees { get; set; } = false; - - /// - /// Should we use line search for a step size - /// - public bool UseLineSearch { get; set; } = false; - - /// - /// Number of post-bracket line search steps - /// - public int NumPostBracketSteps { get; set; } - - /// - /// Minimum line search step size - /// - public double MinStepSize { get; set; } - - /// - /// Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent) - /// - public BoostedTreeArgsOptimizationAlgorithmType OptimizationAlgorithm { get; set; } = BoostedTreeArgsOptimizationAlgorithmType.GradientDescent; - - /// - /// Early stopping rule. (Validation set (/valid) is required.) 
- /// - [JsonConverter(typeof(ComponentSerializer))] - public EarlyStoppingCriterion EarlyStoppingRule { get; set; } - - /// - /// Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3) - /// - public int EarlyStoppingMetrics { get; set; } - - /// - /// Enable post-training pruning to avoid overfitting. (a validation set is required) - /// - public bool EnablePruning { get; set; } = false; - - /// - /// Use window and tolerance for pruning - /// - public bool UseTolerantPruning { get; set; } = false; - - /// - /// The tolerance threshold for pruning - /// - public double PruningThreshold { get; set; } = 0.004d; - - /// - /// The moving window size for pruning - /// - public int PruningWindowSize { get; set; } = 5; - - /// - /// The learning rate - /// - [TlcModule.SweepableFloatParamAttribute("LearningRates", 0.025f, 0.4f, isLogScale:true)] - public double LearningRates { get; set; } = 0.2d; - - /// - /// Shrinkage - /// - [TlcModule.SweepableFloatParamAttribute("Shrinkage", 0.025f, 4f, isLogScale:true)] - public double Shrinkage { get; set; } = 1d; - - /// - /// Dropout rate for tree regularization - /// - [TlcModule.SweepableDiscreteParamAttribute("DropoutRate", new object[]{0f, 1E-09f, 0.05f, 0.1f, 0.2f})] - public double DropoutRate { get; set; } - - /// - /// Sample each query 1 in k times in the GetDerivatives function - /// - public int GetDerivativesSampleRate { get; set; } = 1; - - /// - /// Write the last ensemble instead of the one determined by early stopping - /// - public bool WriteLastEnsemble { get; set; } = false; - - /// - /// Upper bound on absolute value of single tree output - /// - public double MaxTreeOutput { get; set; } = 100d; - - /// - /// Training starts from random ordering (determined by /r1) - /// - public bool RandomStart { get; set; } = false; - - /// - /// Filter zero lambdas during training - /// - public bool FilterZeroLambdas { get; set; } = false; - - /// - /// Freeform defining the scores that 
should be used as the baseline ranker - /// - public string BaselineScoresFormula { get; set; } - - /// - /// Baseline alpha for tradeoffs of risk (0 is normal training) + /// Shuffle the labels on every iteration. Useful probably only if using this tree as a tree leaf featurizer for multiclass. /// - public string BaselineAlphaRisk { get; set; } + public bool ShuffleLabels { get; set; } = false; /// - /// The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position) + /// Number of labels to be sampled from each leaf to make the distribtuion /// - public string PositionDiscountFreeform { get; set; } + public int QuantileSampleCount { get; set; } = 100; /// /// Allows to choose Parallel FastTree Learning Algorithm @@ -4666,12 +4968,12 @@ public sealed partial class FastTreeBinaryClassifier : Microsoft.ML.Runtime.Entr /// /// The fraction of features (chosen randomly) to use on each iteration /// - public double FeatureFraction { get; set; } = 1d; + public double FeatureFraction { get; set; } = 0.7d; /// /// Number of trees in each bag (0 for disabling bagging) /// - public int BaggingSize { get; set; } + public int BaggingSize { get; set; } = 1; /// /// Percentage of training examples used in each bag @@ -4681,7 +4983,7 @@ public sealed partial class FastTreeBinaryClassifier : Microsoft.ML.Runtime.Entr /// /// The fraction of features (chosen randomly) to use on each split /// - public double SplitFraction { get; set; } = 1d; + public double SplitFraction { get; set; } = 0.7d; /// /// Smoothing paramter for tree regularization @@ -4759,7 +5061,7 @@ public sealed partial class FastTreeBinaryClassifier : Microsoft.ML.Runtime.Entr public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, 
Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -4775,18 +5077,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(FastTreeBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(FastForestRegressor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new FastTreeBinaryClassifierPipelineStep(output); + return new FastForestRegressorPipelineStep(output); } - private class FastTreeBinaryClassifierPipelineStep : ILearningPipelinePredictorStep + private class FastForestRegressorPipelineStep : ILearningPipelinePredictorStep { - public FastTreeBinaryClassifierPipelineStep(Output output) + public FastForestRegressorPipelineStep(Output output) { Model = output.PredictorModel; } @@ -4798,53 +5100,25 @@ public FastTreeBinaryClassifierPipelineStep(Output output) namespace Trainers { + public enum BoostedTreeArgsOptimizationAlgorithmType + { + GradientDescent = 0, + AcceleratedGradientDescent = 1, + ConjugateGradientDescent = 2 + } + /// - /// Trains gradient boosted decision trees to the LambdaRank quasi-gradient. + /// Uses a logit-boost boosted tree learner to perform binary classification. 
/// - public sealed partial class FastTreeRanker : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class FastTreeBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Comma seperated list of gains associated to each relevance label. - /// - public string CustomGains { get; set; } = "0,3,7,15,31"; - - /// - /// Train DCG instead of NDCG - /// - public bool TrainDcg { get; set; } = false; - - /// - /// The sorting algorithm to use for DCG and LambdaMart calculations [DescendingStablePessimistic/DescendingStable/DescendingReverse/DescendingDotNet] - /// - public string SortingAlgorithm { get; set; } = "DescendingStablePessimistic"; - - /// - /// max-NDCG truncation to use in the Lambda Mart algorithm - /// - public int LambdaMartMaxTruncation { get; set; } = 100; - - /// - /// Use shifted NDCG - /// - public bool ShiftedNdcg { get; set; } = false; - - /// - /// Cost function parameter (w/c) - /// - public char CostFunctionParam { get; set; } = 'w'; - - /// - /// Distance weight 2 adjustment to cost - /// - public bool DistanceWeight2 { get; set; } = false; - - /// - /// Normalize query lambdas + /// Should we use derivatives optimized for unbalanced sets /// - public bool NormalizeQueryLambdas { get; set; } = false; + public bool UnbalancedSets { get; set; } = false; /// /// Use best regression step trees? 
@@ -4880,7 +5154,7 @@ public sealed partial class FastTreeRanker : Microsoft.ML.Runtime.EntryPoints.Co /// /// Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3) /// - public int EarlyStoppingMetrics { get; set; } = 1; + public int EarlyStoppingMetrics { get; set; } /// /// Enable post-training pruning to avoid overfitting. (a validation set is required) @@ -5185,7 +5459,7 @@ public sealed partial class FastTreeRanker : Microsoft.ML.Runtime.EntryPoints.Co public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRankingOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -5201,18 +5475,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(FastTreeRanker)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(FastTreeBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new FastTreeRankerPipelineStep(output); + return new FastTreeBinaryClassifierPipelineStep(output); } - private class FastTreeRankerPipelineStep : ILearningPipelinePredictorStep + private class FastTreeBinaryClassifierPipelineStep : ILearningPipelinePredictorStep { - public FastTreeRankerPipelineStep(Output output) + public FastTreeBinaryClassifierPipelineStep(Output output) { Model = output.PredictorModel; } @@ -5226,19 +5500,59 @@ namespace Trainers { /// - /// Trains gradient boosted decision trees to fit 
target values using least-squares. + /// Trains gradient boosted decision trees to the LambdaRank quasi-gradient. /// - public sealed partial class FastTreeRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class FastTreeRanker : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Use best regression step trees? + /// Comma seperated list of gains associated to each relevance label. /// - public bool BestStepRankingRegressionTrees { get; set; } = false; + public string CustomGains { get; set; } = "0,3,7,15,31"; /// - /// Should we use line search for a step size + /// Train DCG instead of NDCG + /// + public bool TrainDcg { get; set; } = false; + + /// + /// The sorting algorithm to use for DCG and LambdaMart calculations [DescendingStablePessimistic/DescendingStable/DescendingReverse/DescendingDotNet] + /// + public string SortingAlgorithm { get; set; } = "DescendingStablePessimistic"; + + /// + /// max-NDCG truncation to use in the Lambda Mart algorithm + /// + public int LambdaMartMaxTruncation { get; set; } = 100; + + /// + /// Use shifted NDCG + /// + public bool ShiftedNdcg { get; set; } = false; + + /// + /// Cost function parameter (w/c) + /// + public char CostFunctionParam { get; set; } = 'w'; + + /// + /// Distance weight 2 adjustment to cost + /// + public bool DistanceWeight2 { get; set; } = false; + + /// + /// Normalize query lambdas + /// + public bool NormalizeQueryLambdas { get; set; } = false; 
+ + /// + /// Use best regression step trees? + /// + public bool BestStepRankingRegressionTrees { get; set; } = false; + + /// + /// Should we use line search for a step size /// public bool UseLineSearch { get; set; } = false; @@ -5571,7 +5885,7 @@ public sealed partial class FastTreeRegressor : Microsoft.ML.Runtime.EntryPoints public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRankingOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -5587,18 +5901,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(FastTreeRegressor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(FastTreeRanker)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new FastTreeRegressorPipelineStep(output); + return new FastTreeRankerPipelineStep(output); } - private class FastTreeRegressorPipelineStep : ILearningPipelinePredictorStep + private class FastTreeRankerPipelineStep : ILearningPipelinePredictorStep { - public FastTreeRegressorPipelineStep(Output output) + public FastTreeRankerPipelineStep(Output output) { Model = output.PredictorModel; } @@ -5612,17 +5926,12 @@ namespace Trainers { /// - /// Trains gradient boosted decision trees to fit target values using a Tweedie loss function. This learner is a generalization of Poisson, compound Poisson, and gamma regression. 
+ /// Trains gradient boosted decision trees to fit target values using least-squares. /// - public sealed partial class FastTreeTweedieRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class FastTreeRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { - /// - /// Index parameter for the Tweedie distribution, in the range [1, 2]. 1 is Poisson loss, 2 is gamma loss, and intermediate values are compound Poisson loss. - /// - public double Index { get; set; } = 1.5d; - /// /// Use best regression step trees? /// @@ -5657,7 +5966,7 @@ public sealed partial class FastTreeTweedieRegressor : Microsoft.ML.Runtime.Entr /// /// Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3) /// - public int EarlyStoppingMetrics { get; set; } + public int EarlyStoppingMetrics { get; set; } = 1; /// /// Enable post-training pruning to avoid overfitting. 
(a validation set is required) @@ -5978,18 +6287,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(FastTreeTweedieRegressor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(FastTreeRegressor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new FastTreeTweedieRegressorPipelineStep(output); + return new FastTreeRegressorPipelineStep(output); } - private class FastTreeTweedieRegressorPipelineStep : ILearningPipelinePredictorStep + private class FastTreeRegressorPipelineStep : ILearningPipelinePredictorStep { - public FastTreeTweedieRegressorPipelineStep(Output output) + public FastTreeRegressorPipelineStep(Output output) { Model = output.PredictorModel; } @@ -6003,184 +6312,136 @@ namespace Trainers { /// - /// Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features. + /// Trains gradient boosted decision trees to fit target values using a Tweedie loss function. This learner is a generalization of Poisson, compound Poisson, and gamma regression. 
/// - public sealed partial class GeneralizedAdditiveModelBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class FastTreeTweedieRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithGroupId, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Should we use derivatives optimized for unbalanced sets + /// Index parameter for the Tweedie distribution, in the range [1, 2]. 1 is Poisson loss, 2 is gamma loss, and intermediate values are compound Poisson loss. /// - public bool UnbalancedSets { get; set; } = false; + public double Index { get; set; } = 1.5d; /// - /// The calibrator kind to apply to the predictor. Specify null for no calibration + /// Use best regression step trees? /// - [JsonConverter(typeof(ComponentSerializer))] - public CalibratorTrainer Calibrator { get; set; } = new PlattCalibratorCalibratorTrainer(); + public bool BestStepRankingRegressionTrees { get; set; } = false; /// - /// The maximum number of examples to use when training the calibrator + /// Should we use line search for a step size /// - public int MaxCalibrationExamples { get; set; } = 1000000; + public bool UseLineSearch { get; set; } = false; /// - /// The entropy (regularization) coefficient between 0 and 1 + /// Number of post-bracket line search steps /// - public double EntropyCoefficient { get; set; } + public int NumPostBracketSteps { get; set; } /// - /// Tree fitting gain confidence requirement (should be in the range [0,1) ). 
+ /// Minimum line search step size /// - public int GainConfidenceLevel { get; set; } + public double MinStepSize { get; set; } /// - /// Total number of iterations over all features + /// Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent) /// - [TlcModule.SweepableDiscreteParamAttribute("NumIterations", new object[]{200, 1500, 9500})] - public int NumIterations { get; set; } = 9500; + public BoostedTreeArgsOptimizationAlgorithmType OptimizationAlgorithm { get; set; } = BoostedTreeArgsOptimizationAlgorithmType.GradientDescent; /// - /// The number of threads to use + /// Early stopping rule. (Validation set (/valid) is required.) /// - public int? NumThreads { get; set; } + [JsonConverter(typeof(ComponentSerializer))] + public EarlyStoppingCriterion EarlyStoppingRule { get; set; } /// - /// The learning rate + /// Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3) /// - [TlcModule.SweepableFloatParamAttribute("LearningRates", 0.001f, 0.1f, isLogScale:true)] - public double LearningRates { get; set; } = 0.002d; + public int EarlyStoppingMetrics { get; set; } /// - /// Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose + /// Enable post-training pruning to avoid overfitting. (a validation set is required) /// - public bool? 
DiskTranspose { get; set; } + public bool EnablePruning { get; set; } = false; /// - /// Maximum number of distinct values (bins) per feature + /// Use window and tolerance for pruning /// - public int MaxBins { get; set; } = 255; + public bool UseTolerantPruning { get; set; } = false; /// - /// Upper bound on absolute value of single output + /// The tolerance threshold for pruning /// - public double MaxOutput { get; set; } = double.PositiveInfinity; + public double PruningThreshold { get; set; } = 0.004d; /// - /// Sample each query 1 in k times in the GetDerivatives function + /// The moving window size for pruning /// - public int GetDerivativesSampleRate { get; set; } = 1; + public int PruningWindowSize { get; set; } = 5; /// - /// The seed of the random number generator + /// The learning rate /// - public int RngSeed { get; set; } = 123; + [TlcModule.SweepableFloatParamAttribute("LearningRates", 0.025f, 0.4f, isLogScale:true)] + public double LearningRates { get; set; } = 0.2d; /// - /// Minimum number of training instances required to form a partition + /// Shrinkage /// - [TlcModule.SweepableDiscreteParamAttribute("MinDocuments", new object[]{1, 10, 50})] - public int MinDocuments { get; set; } = 10; + [TlcModule.SweepableFloatParamAttribute("Shrinkage", 0.025f, 4f, isLogScale:true)] + public double Shrinkage { get; set; } = 1d; /// - /// Whether to collectivize features during dataset preparation to speed up training + /// Dropout rate for tree regularization /// - public bool FeatureFlocks { get; set; } = true; + [TlcModule.SweepableDiscreteParamAttribute("DropoutRate", new object[]{0f, 1E-09f, 0.05f, 0.1f, 0.2f})] + public double DropoutRate { get; set; } /// - /// Column to use for example weight + /// Sample each query 1 in k times in the GetDerivatives function /// - public Microsoft.ML.Runtime.EntryPoints.Optional WeightColumn { get; set; } + public int GetDerivativesSampleRate { get; set; } = 1; /// - /// Column to use for labels + /// Write the 
last ensemble instead of the one determined by early stopping /// - public string LabelColumn { get; set; } = "Label"; + public bool WriteLastEnsemble { get; set; } = false; /// - /// The data to be used for training + /// Upper bound on absolute value of single tree output /// - public Var TrainingData { get; set; } = new Var(); + public double MaxTreeOutput { get; set; } = 100d; /// - /// Column to use for features + /// Training starts from random ordering (determined by /r1) /// - public string FeatureColumn { get; set; } = "Features"; + public bool RandomStart { get; set; } = false; /// - /// Normalize option for the feature column + /// Filter zero lambdas during training /// - public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; + public bool FilterZeroLambdas { get; set; } = false; /// - /// Whether learner should cache input training data + /// Freeform defining the scores that should be used as the baseline ranker /// - public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - - - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput - { - /// - /// The trained model - /// - public Var PredictorModel { get; set; } = new Var(); - - } - public Var GetInputData() => TrainingData; - - public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) - { - if (previousStep != null) - { - if (!(previousStep is ILearningPipelineDataStep dataStep)) - { - throw new InvalidOperationException($"{ nameof(GeneralizedAdditiveModelBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); - } - - TrainingData = dataStep.Data; - } - Output output = experiment.Add(this); - return new GeneralizedAdditiveModelBinaryClassifierPipelineStep(output); - } - - private class 
GeneralizedAdditiveModelBinaryClassifierPipelineStep : ILearningPipelinePredictorStep - { - public GeneralizedAdditiveModelBinaryClassifierPipelineStep(Output output) - { - Model = output.PredictorModel; - } - - public Var Model { get; } - } - } - } - - namespace Trainers - { - - /// - /// Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features. - /// - public sealed partial class GeneralizedAdditiveModelRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem - { - + public string BaselineScoresFormula { get; set; } /// - /// The entropy (regularization) coefficient between 0 and 1 + /// Baseline alpha for tradeoffs of risk (0 is normal training) /// - public double EntropyCoefficient { get; set; } + public string BaselineAlphaRisk { get; set; } /// - /// Tree fitting gain confidence requirement (should be in the range [0,1) ). + /// The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position) /// - public int GainConfidenceLevel { get; set; } + public string PositionDiscountFreeform { get; set; } /// - /// Total number of iterations over all features + /// Allows to choose Parallel FastTree Learning Algorithm /// - [TlcModule.SweepableDiscreteParamAttribute("NumIterations", new object[]{200, 1500, 9500})] - public int NumIterations { get; set; } = 9500; + [JsonConverter(typeof(ComponentSerializer))] + public ParallelTraining ParallelTrainer { get; set; } = new SingleParallelTraining(); /// /// The number of threads to use @@ -6188,297 +6449,192 @@ public sealed partial class GeneralizedAdditiveModelRegressor : Microsoft.ML.Run public int? 
NumThreads { get; set; } /// - /// The learning rate + /// The seed of the random number generator /// - [TlcModule.SweepableFloatParamAttribute("LearningRates", 0.001f, 0.1f, isLogScale:true)] - public double LearningRates { get; set; } = 0.002d; + public int RngSeed { get; set; } = 123; /// - /// Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose + /// The seed of the active feature selection /// - public bool? DiskTranspose { get; set; } + public int FeatureSelectSeed { get; set; } = 123; /// - /// Maximum number of distinct values (bins) per feature + /// The entropy (regularization) coefficient between 0 and 1 /// - public int MaxBins { get; set; } = 255; + public double EntropyCoefficient { get; set; } /// - /// Upper bound on absolute value of single output + /// The number of histograms in the pool (between 2 and numLeaves) /// - public double MaxOutput { get; set; } = double.PositiveInfinity; + public int HistogramPoolSize { get; set; } = -1; /// - /// Sample each query 1 in k times in the GetDerivatives function + /// Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose /// - public int GetDerivativesSampleRate { get; set; } = 1; + public bool? DiskTranspose { get; set; } /// - /// The seed of the random number generator + /// Whether to collectivize features during dataset preparation to speed up training /// - public int RngSeed { get; set; } = 123; + public bool FeatureFlocks { get; set; } = true; /// - /// Minimum number of training instances required to form a partition + /// Whether to do split based on multiple categorical feature values. 
/// - [TlcModule.SweepableDiscreteParamAttribute("MinDocuments", new object[]{1, 10, 50})] - public int MinDocuments { get; set; } = 10; + public bool CategoricalSplit { get; set; } = false; /// - /// Whether to collectivize features during dataset preparation to speed up training + /// Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features. /// - public bool FeatureFlocks { get; set; } = true; + public int MaxCategoricalGroupsPerNode { get; set; } = 64; /// - /// Column to use for example weight + /// Maximum categorical split points to consider when splitting on a categorical feature. /// - public Microsoft.ML.Runtime.EntryPoints.Optional WeightColumn { get; set; } + public int MaxCategoricalSplitPoints { get; set; } = 64; /// - /// Column to use for labels + /// Minimum categorical docs percentage in a bin to consider for a split. /// - public string LabelColumn { get; set; } = "Label"; + public double MinDocsPercentageForCategoricalSplit { get; set; } = 0.001d; /// - /// The data to be used for training + /// Minimum categorical doc count in a bin to consider for a split. /// - public Var TrainingData { get; set; } = new Var(); + public int MinDocsForCategoricalSplit { get; set; } = 100; /// - /// Column to use for features + /// Bias for calculating gradient for each feature bin for a categorical feature. /// - public string FeatureColumn { get; set; } = "Features"; + public double Bias { get; set; } /// - /// Normalize option for the feature column + /// Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle. 
/// - public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; + public Bundle Bundling { get; set; } = Bundle.None; /// - /// Whether learner should cache input training data + /// Maximum number of distinct values (bins) per feature /// - public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - - - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput - { - /// - /// The trained model - /// - public Var PredictorModel { get; set; } = new Var(); - - } - public Var GetInputData() => TrainingData; - - public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) - { - if (previousStep != null) - { - if (!(previousStep is ILearningPipelineDataStep dataStep)) - { - throw new InvalidOperationException($"{ nameof(GeneralizedAdditiveModelRegressor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); - } - - TrainingData = dataStep.Data; - } - Output output = experiment.Add(this); - return new GeneralizedAdditiveModelRegressorPipelineStep(output); - } - - private class GeneralizedAdditiveModelRegressorPipelineStep : ILearningPipelinePredictorStep - { - public GeneralizedAdditiveModelRegressorPipelineStep(Output output) - { - Model = output.PredictorModel; - } - - public Var Model { get; } - } - } - } - - namespace Trainers - { - public enum KMeansPlusPlusTrainerInitAlgorithm - { - KMeansPlusPlus = 0, - Random = 1, - KMeansParallel = 2 - } - - - /// - /// K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified number of clusters in order to minimize the within-cluster sum of squares. K-means++ improves upon K-means by using a better method for choosing the initial cluster centers. 
- /// - public sealed partial class KMeansPlusPlusClusterer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IUnsupervisedTrainerWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem - { - + public int MaxBins { get; set; } = 255; /// - /// The number of clusters + /// Sparsity level needed to use sparse feature representation /// - [TlcModule.SweepableDiscreteParamAttribute("K", new object[]{5, 10, 20, 40})] - public int K { get; set; } = 5; + public double SparsifyThreshold { get; set; } = 0.7d; /// - /// Cluster initialization algorithm + /// The feature first use penalty coefficient /// - public KMeansPlusPlusTrainerInitAlgorithm InitAlgorithm { get; set; } = KMeansPlusPlusTrainerInitAlgorithm.KMeansParallel; + public double FeatureFirstUsePenalty { get; set; } /// - /// Tolerance parameter for trainer convergence. Lower = slower, more accurate + /// The feature re-use penalty (regularization) coefficient /// - public float OptTol { get; set; } = 1E-07f; + public double FeatureReusePenalty { get; set; } /// - /// Maximum number of iterations. + /// Tree fitting gain confidence requirement (should be in the range [0,1) ). /// - public int MaxIterations { get; set; } = 1000; + public double GainConfidenceLevel { get; set; } /// - /// Memory budget (in MBs) to use for KMeans acceleration + /// The temperature of the randomized softmax distribution for choosing the feature /// - public int AccelMemBudgetMb { get; set; } = 4096; + public double SoftmaxTemperature { get; set; } /// - /// Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. + /// Print execution time breakdown to stdout /// - public int? 
NumThreads { get; set; } + public bool ExecutionTimes { get; set; } = false; /// - /// Column to use for example weight + /// The max number of leaves in each regression tree /// - public Microsoft.ML.Runtime.EntryPoints.Optional WeightColumn { get; set; } + [TlcModule.SweepableLongParamAttribute("NumLeaves", 2, 128, stepSize:4, isLogScale:true)] + public int NumLeaves { get; set; } = 20; /// - /// The data to be used for training + /// The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data /// - public Var TrainingData { get; set; } = new Var(); + [TlcModule.SweepableDiscreteParamAttribute("MinDocumentsInLeafs", new object[]{1, 10, 50})] + public int MinDocumentsInLeafs { get; set; } = 10; /// - /// Column to use for features + /// Total number of decision trees to create in the ensemble /// - public string FeatureColumn { get; set; } = "Features"; + [TlcModule.SweepableDiscreteParamAttribute("NumTrees", new object[]{20, 100, 500})] + public int NumTrees { get; set; } = 100; /// - /// Normalize option for the feature column + /// The fraction of features (chosen randomly) to use on each iteration /// - public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; + public double FeatureFraction { get; set; } = 1d; /// - /// Whether learner should cache input training data + /// Number of trees in each bag (0 for disabling bagging) /// - public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - - - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IClusteringOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput - { - /// - /// The trained model - /// - public Var PredictorModel { get; set; } = new Var(); - - } - public Var GetInputData() => TrainingData; - - public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) - { - if 
(previousStep != null) - { - if (!(previousStep is ILearningPipelineDataStep dataStep)) - { - throw new InvalidOperationException($"{ nameof(KMeansPlusPlusClusterer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); - } - - TrainingData = dataStep.Data; - } - Output output = experiment.Add(this); - return new KMeansPlusPlusClustererPipelineStep(output); - } - - private class KMeansPlusPlusClustererPipelineStep : ILearningPipelinePredictorStep - { - public KMeansPlusPlusClustererPipelineStep(Output output) - { - Model = output.PredictorModel; - } - - public Var Model { get; } - } - } - } - - namespace Trainers - { - - /// - /// Train a linear SVM. - /// - public sealed partial class LinearSvmBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem - { + public int BaggingSize { get; set; } + /// + /// Percentage of training examples used in each bag + /// + public double BaggingTrainFraction { get; set; } = 0.7d; /// - /// Regularizer constant + /// The fraction of features (chosen randomly) to use on each split /// - [TlcModule.SweepableFloatParamAttribute("Lambda", 1E-05f, 0.1f, stepSize:10, isLogScale:true)] - public float Lambda { get; set; } = 0.001f; + public double SplitFraction { get; set; } = 1d; /// - /// Batch size + /// Smoothing paramter for tree regularization /// - public int BatchSize { get; set; } = 1; + public double Smoothing { get; set; } /// - /// Perform projection to unit-ball? Typically used with batch size > 1. 
+ /// When a root split is impossible, allow training to proceed /// - [TlcModule.SweepableDiscreteParamAttribute("PerformProjection", new object[]{false, true})] - public bool PerformProjection { get; set; } = false; + public bool AllowEmptyTrees { get; set; } = true; /// - /// No bias + /// The level of feature compression to use /// - [TlcModule.SweepableDiscreteParamAttribute("NoBias", new object[]{false, true})] - public bool NoBias { get; set; } = false; + public int FeatureCompressionLevel { get; set; } = 1; /// - /// The calibrator kind to apply to the predictor. Specify null for no calibration + /// Compress the tree Ensemble /// - [JsonConverter(typeof(ComponentSerializer))] - public CalibratorTrainer Calibrator { get; set; } = new PlattCalibratorCalibratorTrainer(); + public bool CompressEnsemble { get; set; } = false; /// - /// The maximum number of examples to use when training the calibrator + /// Maximum Number of trees after compression /// - public int MaxCalibrationExamples { get; set; } = 1000000; + public int MaxTreesAfterCompression { get; set; } = -1; /// - /// Number of iterations + /// Print metrics graph for the first test set /// - [TlcModule.SweepableLongParamAttribute("NumIterations", 1, 100, stepSize:10, isLogScale:true)] - public int NumIterations { get; set; } = 1; + public bool PrintTestGraph { get; set; } = false; /// - /// Initial Weights and bias, comma-separated + /// Print Train and Validation metrics in graph /// - public string InitialWeights { get; set; } + public bool PrintTrainValidGraph { get; set; } = false; /// - /// Init weights diameter + /// Calculate metric values for train/valid/test every k rounds /// - [TlcModule.SweepableFloatParamAttribute("InitWtsDiameter", 0f, 1f, numSteps:5)] - public float InitWtsDiameter { get; set; } + public int TestFrequency { get; set; } = 2147483647; /// - /// Whether to shuffle for each training iteration + /// Column to use for example groupId /// - 
[TlcModule.SweepableDiscreteParamAttribute("Shuffle", new object[]{false, true})] - public bool Shuffle { get; set; } = true; + public Microsoft.ML.Runtime.EntryPoints.Optional GroupIdColumn { get; set; } /// - /// Size of cache when trained in Scope + /// Column to use for example weight /// - public int StreamingCacheSize { get; set; } = 1000000; + public Microsoft.ML.Runtime.EntryPoints.Optional WeightColumn { get; set; } /// /// Column to use for labels @@ -6506,7 +6662,7 @@ public sealed partial class LinearSvmBinaryClassifier : Microsoft.ML.Runtime.Ent public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -6522,18 +6678,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(LinearSvmBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(FastTreeTweedieRegressor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new LinearSvmBinaryClassifierPipelineStep(output); + return new FastTreeTweedieRegressorPipelineStep(output); } - private class LinearSvmBinaryClassifierPipelineStep : ILearningPipelinePredictorStep + private class FastTreeTweedieRegressorPipelineStep : ILearningPipelinePredictorStep { - public LinearSvmBinaryClassifierPipelineStep(Output output) + public FastTreeTweedieRegressorPipelineStep(Output output) { Model = 
output.PredictorModel; } @@ -6547,83 +6703,90 @@ namespace Trainers { /// - /// Train a logistic regression binary model + /// Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features. /// - public sealed partial class LogisticRegressionBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class GeneralizedAdditiveModelBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Show statistics of training examples. + /// Should we use derivatives optimized for unbalanced sets /// - public bool ShowTrainingStats { get; set; } = false; + public bool UnbalancedSets { get; set; } = false; /// - /// L2 regularization weight + /// The calibrator kind to apply to the predictor. Specify null for no calibration /// - [TlcModule.SweepableFloatParamAttribute("L2Weight", 0f, 1f, numSteps:4)] - public float L2Weight { get; set; } = 1f; + [JsonConverter(typeof(ComponentSerializer))] + public CalibratorTrainer Calibrator { get; set; } = new PlattCalibratorCalibratorTrainer(); /// - /// L1 regularization weight + /// The maximum number of examples to use when training the calibrator /// - [TlcModule.SweepableFloatParamAttribute("L1Weight", 0f, 1f, numSteps:4)] - public float L1Weight { get; set; } = 1f; + public int MaxCalibrationExamples { get; set; } = 1000000; /// - /// Tolerance parameter for optimization convergence. 
Lower = slower, more accurate + /// The entropy (regularization) coefficient between 0 and 1 /// - [TlcModule.SweepableDiscreteParamAttribute("OptTol", new object[]{0.0001f, 1E-07f})] - public float OptTol { get; set; } = 1E-07f; + public double EntropyCoefficient { get; set; } /// - /// Memory size for L-BFGS. Lower=faster, less accurate + /// Tree fitting gain confidence requirement (should be in the range [0,1) ). /// - [TlcModule.SweepableDiscreteParamAttribute("MemorySize", new object[]{5, 20, 50})] - public int MemorySize { get; set; } = 20; + public int GainConfidenceLevel { get; set; } /// - /// Maximum iterations. + /// Total number of iterations over all features /// - [TlcModule.SweepableLongParamAttribute("MaxIterations", 1, 2147483647)] - public int MaxIterations { get; set; } = 2147483647; + [TlcModule.SweepableDiscreteParamAttribute("NumIterations", new object[]{200, 1500, 9500})] + public int NumIterations { get; set; } = 9500; /// - /// Run SGD to initialize LR weights, converging to this tolerance + /// The number of threads to use /// - public float SgdInitializationTolerance { get; set; } + public int? NumThreads { get; set; } /// - /// If set to true, produce no output during training. + /// The learning rate /// - public bool Quiet { get; set; } = false; + [TlcModule.SweepableFloatParamAttribute("LearningRates", 0.001f, 0.1f, isLogScale:true)] + public double LearningRates { get; set; } = 0.002d; /// - /// Init weights diameter + /// Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose /// - [TlcModule.SweepableFloatParamAttribute("InitWtsDiameter", 0f, 1f, numSteps:5)] - public float InitWtsDiameter { get; set; } + public bool? DiskTranspose { get; set; } /// - /// Whether or not to use threads. 
Default is true + /// Maximum number of distinct values (bins) per feature /// - public bool UseThreads { get; set; } = true; + public int MaxBins { get; set; } = 255; /// - /// Number of threads + /// Upper bound on absolute value of single output /// - public int? NumThreads { get; set; } + public double MaxOutput { get; set; } = double.PositiveInfinity; /// - /// Force densification of the internal optimization vectors + /// Sample each query 1 in k times in the GetDerivatives function /// - [TlcModule.SweepableDiscreteParamAttribute("DenseOptimizer", new object[]{false, true})] - public bool DenseOptimizer { get; set; } = false; + public int GetDerivativesSampleRate { get; set; } = 1; /// - /// Enforce non-negative weights + /// The seed of the random number generator /// - public bool EnforceNonNegativity { get; set; } = false; + public int RngSeed { get; set; } = 123; + + /// + /// Minimum number of training instances required to form a partition + /// + [TlcModule.SweepableDiscreteParamAttribute("MinDocuments", new object[]{1, 10, 50})] + public int MinDocuments { get; set; } = 10; + + /// + /// Whether to collectivize features during dataset preparation to speed up training + /// + public bool FeatureFlocks { get; set; } = true; /// /// Column to use for example weight @@ -6672,18 +6835,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(LogisticRegressionBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(GeneralizedAdditiveModelBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new LogisticRegressionBinaryClassifierPipelineStep(output); + return new GeneralizedAdditiveModelBinaryClassifierPipelineStep(output); 
} - private class LogisticRegressionBinaryClassifierPipelineStep : ILearningPipelinePredictorStep + private class GeneralizedAdditiveModelBinaryClassifierPipelineStep : ILearningPipelinePredictorStep { - public LogisticRegressionBinaryClassifierPipelineStep(Output output) + public GeneralizedAdditiveModelBinaryClassifierPipelineStep(Output output) { Model = output.PredictorModel; } @@ -6697,83 +6860,74 @@ namespace Trainers { /// - /// Train a logistic regression multi class model + /// Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features. /// - public sealed partial class LogisticRegressionClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class GeneralizedAdditiveModelRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Show statistics of training examples. - /// - public bool ShowTrainingStats { get; set; } = false; - - /// - /// L2 regularization weight + /// The entropy (regularization) coefficient between 0 and 1 /// - [TlcModule.SweepableFloatParamAttribute("L2Weight", 0f, 1f, numSteps:4)] - public float L2Weight { get; set; } = 1f; + public double EntropyCoefficient { get; set; } /// - /// L1 regularization weight + /// Tree fitting gain confidence requirement (should be in the range [0,1) ). /// - [TlcModule.SweepableFloatParamAttribute("L1Weight", 0f, 1f, numSteps:4)] - public float L1Weight { get; set; } = 1f; + public int GainConfidenceLevel { get; set; } /// - /// Tolerance parameter for optimization convergence. 
Lower = slower, more accurate + /// Total number of iterations over all features /// - [TlcModule.SweepableDiscreteParamAttribute("OptTol", new object[]{0.0001f, 1E-07f})] - public float OptTol { get; set; } = 1E-07f; + [TlcModule.SweepableDiscreteParamAttribute("NumIterations", new object[]{200, 1500, 9500})] + public int NumIterations { get; set; } = 9500; /// - /// Memory size for L-BFGS. Lower=faster, less accurate + /// The number of threads to use /// - [TlcModule.SweepableDiscreteParamAttribute("MemorySize", new object[]{5, 20, 50})] - public int MemorySize { get; set; } = 20; + public int? NumThreads { get; set; } /// - /// Maximum iterations. + /// The learning rate /// - [TlcModule.SweepableLongParamAttribute("MaxIterations", 1, 2147483647)] - public int MaxIterations { get; set; } = 2147483647; + [TlcModule.SweepableFloatParamAttribute("LearningRates", 0.001f, 0.1f, isLogScale:true)] + public double LearningRates { get; set; } = 0.002d; /// - /// Run SGD to initialize LR weights, converging to this tolerance + /// Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose /// - public float SgdInitializationTolerance { get; set; } + public bool? DiskTranspose { get; set; } /// - /// If set to true, produce no output during training. + /// Maximum number of distinct values (bins) per feature /// - public bool Quiet { get; set; } = false; + public int MaxBins { get; set; } = 255; /// - /// Init weights diameter + /// Upper bound on absolute value of single output /// - [TlcModule.SweepableFloatParamAttribute("InitWtsDiameter", 0f, 1f, numSteps:5)] - public float InitWtsDiameter { get; set; } + public double MaxOutput { get; set; } = double.PositiveInfinity; /// - /// Whether or not to use threads. 
Default is true + /// Sample each query 1 in k times in the GetDerivatives function /// - public bool UseThreads { get; set; } = true; + public int GetDerivativesSampleRate { get; set; } = 1; /// - /// Number of threads + /// The seed of the random number generator /// - public int? NumThreads { get; set; } + public int RngSeed { get; set; } = 123; /// - /// Force densification of the internal optimization vectors + /// Minimum number of training instances required to form a partition /// - [TlcModule.SweepableDiscreteParamAttribute("DenseOptimizer", new object[]{false, true})] - public bool DenseOptimizer { get; set; } = false; + [TlcModule.SweepableDiscreteParamAttribute("MinDocuments", new object[]{1, 10, 50})] + public int MinDocuments { get; set; } = 10; /// - /// Enforce non-negative weights + /// Whether to collectivize features during dataset preparation to speed up training /// - public bool EnforceNonNegativity { get; set; } = false; + public bool FeatureFlocks { get; set; } = true; /// /// Column to use for example weight @@ -6806,7 +6960,7 @@ public sealed partial class LogisticRegressionClassifier : Microsoft.ML.Runtime. 
public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IMulticlassClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -6822,18 +6976,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(LogisticRegressionClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(GeneralizedAdditiveModelRegressor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new LogisticRegressionClassifierPipelineStep(output); + return new GeneralizedAdditiveModelRegressorPipelineStep(output); } - private class LogisticRegressionClassifierPipelineStep : ILearningPipelinePredictorStep + private class GeneralizedAdditiveModelRegressorPipelineStep : ILearningPipelinePredictorStep { - public LogisticRegressionClassifierPipelineStep(Output output) + public GeneralizedAdditiveModelRegressorPipelineStep(Output output) { Model = output.PredictorModel; } @@ -6845,18 +6999,56 @@ public LogisticRegressionClassifierPipelineStep(Output output) namespace Trainers { + public enum KMeansPlusPlusTrainerInitAlgorithm + { + KMeansPlusPlus = 0, + Random = 1, + KMeansParallel = 2 + } + /// - /// Train a MultiClassNaiveBayesTrainer. + /// K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified number of clusters in order to minimize the within-cluster sum of squares. 
K-means++ improves upon K-means by using a better method for choosing the initial cluster centers. /// - public sealed partial class NaiveBayesClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class KMeansPlusPlusClusterer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IUnsupervisedTrainerWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Column to use for labels + /// The number of clusters /// - public string LabelColumn { get; set; } = "Label"; + [TlcModule.SweepableDiscreteParamAttribute("K", new object[]{5, 10, 20, 40})] + public int K { get; set; } = 5; + + /// + /// Cluster initialization algorithm + /// + public KMeansPlusPlusTrainerInitAlgorithm InitAlgorithm { get; set; } = KMeansPlusPlusTrainerInitAlgorithm.KMeansParallel; + + /// + /// Tolerance parameter for trainer convergence. Lower = slower, more accurate + /// + public float OptTol { get; set; } = 1E-07f; + + /// + /// Maximum number of iterations. + /// + public int MaxIterations { get; set; } = 1000; + + /// + /// Memory budget (in MBs) to use for KMeans acceleration + /// + public int AccelMemBudgetMb { get; set; } = 4096; + + /// + /// Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. + /// + public int? 
NumThreads { get; set; } + + /// + /// Column to use for example weight + /// + public Microsoft.ML.Runtime.EntryPoints.Optional WeightColumn { get; set; } /// /// The data to be used for training @@ -6879,7 +7071,7 @@ public sealed partial class NaiveBayesClassifier : Microsoft.ML.Runtime.EntryPoi public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IMulticlassClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IClusteringOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -6895,18 +7087,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(NaiveBayesClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(KMeansPlusPlusClusterer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new NaiveBayesClassifierPipelineStep(output); + return new KMeansPlusPlusClustererPipelineStep(output); } - private class NaiveBayesClassifierPipelineStep : ILearningPipelinePredictorStep + private class KMeansPlusPlusClustererPipelineStep : ILearningPipelinePredictorStep { - public NaiveBayesClassifierPipelineStep(Output output) + public KMeansPlusPlusClustererPipelineStep(Output output) { Model = output.PredictorModel; } @@ -6920,82 +7112,62 @@ namespace Trainers { /// - /// Train a Online gradient descent perceptron. + /// Train a linear SVM. 
/// - public sealed partial class OnlineGradientDescentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class LinearSvmBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Loss Function + /// Regularizer constant /// - [JsonConverter(typeof(ComponentSerializer))] - public RegressionLossFunction LossFunction { get; set; } = new SquaredLossRegressionLossFunction(); + [TlcModule.SweepableFloatParamAttribute("Lambda", 1E-05f, 0.1f, stepSize:10, isLogScale:true)] + public float Lambda { get; set; } = 0.001f; /// - /// Learning rate + /// Batch size /// - [TlcModule.SweepableDiscreteParamAttribute("LearningRate", new object[]{0.01f, 0.1f, 0.5f, 1f})] - public float LearningRate { get; set; } = 0.1f; + public int BatchSize { get; set; } = 1; /// - /// Decrease learning rate + /// Perform projection to unit-ball? Typically used with batch size > 1. /// - [TlcModule.SweepableDiscreteParamAttribute("DecreaseLearningRate", new object[]{false, true})] - public bool DecreaseLearningRate { get; set; } = true; + [TlcModule.SweepableDiscreteParamAttribute("PerformProjection", new object[]{false, true})] + public bool PerformProjection { get; set; } = false; /// - /// Number of examples after which weights will be reset to the current average + /// No bias /// - public long? ResetWeightsAfterXExamples { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("NoBias", new object[]{false, true})] + public bool NoBias { get; set; } = false; /// - /// Instead of updating averaged weights on every example, only update when loss is nonzero + /// The calibrator kind to apply to the predictor. 
Specify null for no calibration /// - public bool DoLazyUpdates { get; set; } = true; + [JsonConverter(typeof(ComponentSerializer))] + public CalibratorTrainer Calibrator { get; set; } = new PlattCalibratorCalibratorTrainer(); /// - /// L2 Regularization Weight + /// The maximum number of examples to use when training the calibrator /// - [TlcModule.SweepableFloatParamAttribute("L2RegularizerWeight", 0f, 0.5f)] - public float L2RegularizerWeight { get; set; } + public int MaxCalibrationExamples { get; set; } = 1000000; /// - /// Extra weight given to more recent updates + /// Number of iterations /// - public float RecencyGain { get; set; } + [TlcModule.SweepableLongParamAttribute("NumIterations", 1, 100, stepSize:10, isLogScale:true)] + public int NumIterations { get; set; } = 1; /// - /// Whether Recency Gain is multiplicative (vs. additive) + /// Initial Weights and bias, comma-separated /// - public bool RecencyGainMulti { get; set; } = false; + public string InitialWeights { get; set; } /// - /// Do averaging? 
+ /// Init weights diameter /// - public bool Averaged { get; set; } = true; - - /// - /// The inexactness tolerance for averaging - /// - public float AveragedTolerance { get; set; } = 0.01f; - - /// - /// Number of iterations - /// - [TlcModule.SweepableLongParamAttribute("NumIterations", 1, 100, stepSize:10, isLogScale:true)] - public int NumIterations { get; set; } = 1; - - /// - /// Initial Weights and bias, comma-separated - /// - public string InitialWeights { get; set; } - - /// - /// Init weights diameter - /// - [TlcModule.SweepableFloatParamAttribute("InitWtsDiameter", 0f, 1f, numSteps:5)] - public float InitWtsDiameter { get; set; } + [TlcModule.SweepableFloatParamAttribute("InitWtsDiameter", 0f, 1f, numSteps:5)] + public float InitWtsDiameter { get; set; } /// /// Whether to shuffle for each training iteration @@ -7034,7 +7206,7 @@ public sealed partial class OnlineGradientDescentRegressor : Microsoft.ML.Runtim public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -7050,18 +7222,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(OnlineGradientDescentRegressor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(LinearSvmBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new 
OnlineGradientDescentRegressorPipelineStep(output); + return new LinearSvmBinaryClassifierPipelineStep(output); } - private class OnlineGradientDescentRegressorPipelineStep : ILearningPipelinePredictorStep + private class LinearSvmBinaryClassifierPipelineStep : ILearningPipelinePredictorStep { - public OnlineGradientDescentRegressorPipelineStep(Output output) + public LinearSvmBinaryClassifierPipelineStep(Output output) { Model = output.PredictorModel; } @@ -7075,40 +7247,94 @@ namespace Trainers { /// - /// Train an PCA Anomaly model. + /// Train a logistic regression binary model /// - public sealed partial class PcaAnomalyDetector : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IUnsupervisedTrainerWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class LogisticRegressionBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// The number of components in the PCA + /// Show statistics of training examples. 
/// - [TlcModule.SweepableDiscreteParamAttribute("Rank", new object[]{10, 20, 40, 80})] - public int Rank { get; set; } = 20; + public bool ShowTrainingStats { get; set; } = false; /// - /// Oversampling parameter for randomized PCA training + /// L2 regularization weight /// - [TlcModule.SweepableDiscreteParamAttribute("Oversampling", new object[]{10, 20, 40})] - public int Oversampling { get; set; } = 20; + [TlcModule.SweepableFloatParamAttribute("L2Weight", 0f, 1f, numSteps:4)] + public float L2Weight { get; set; } = 1f; /// - /// If enabled, data is centered to be zero mean + /// L1 regularization weight /// - [TlcModule.SweepableDiscreteParamAttribute("Center", new object[]{false, true})] - public bool Center { get; set; } = true; + [TlcModule.SweepableFloatParamAttribute("L1Weight", 0f, 1f, numSteps:4)] + public float L1Weight { get; set; } = 1f; /// - /// The seed for random number generation + /// Tolerance parameter for optimization convergence. Lower = slower, more accurate /// - public int? Seed { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("OptTol", new object[]{0.0001f, 1E-07f})] + public float OptTol { get; set; } = 1E-07f; + + /// + /// Memory size for L-BFGS. Lower=faster, less accurate + /// + [TlcModule.SweepableDiscreteParamAttribute("MemorySize", new object[]{5, 20, 50})] + public int MemorySize { get; set; } = 20; + + /// + /// Maximum iterations. + /// + [TlcModule.SweepableLongParamAttribute("MaxIterations", 1, 2147483647)] + public int MaxIterations { get; set; } = 2147483647; + + /// + /// Run SGD to initialize LR weights, converging to this tolerance + /// + public float SgdInitializationTolerance { get; set; } + + /// + /// If set to true, produce no output during training. 
+ /// + public bool Quiet { get; set; } = false; + + /// + /// Init weights diameter + /// + [TlcModule.SweepableFloatParamAttribute("InitWtsDiameter", 0f, 1f, numSteps:5)] + public float InitWtsDiameter { get; set; } + + /// + /// Whether or not to use threads. Default is true + /// + public bool UseThreads { get; set; } = true; + + /// + /// Number of threads + /// + public int? NumThreads { get; set; } + + /// + /// Force densification of the internal optimization vectors + /// + [TlcModule.SweepableDiscreteParamAttribute("DenseOptimizer", new object[]{false, true})] + public bool DenseOptimizer { get; set; } = false; + + /// + /// Enforce non-negative weights + /// + public bool EnforceNonNegativity { get; set; } = false; /// /// Column to use for example weight /// public Microsoft.ML.Runtime.EntryPoints.Optional WeightColumn { get; set; } + /// + /// Column to use for labels + /// + public string LabelColumn { get; set; } = "Label"; + /// /// The data to be used for training /// @@ -7130,7 +7356,7 @@ public sealed partial class PcaAnomalyDetector : Microsoft.ML.Runtime.EntryPoint public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IAnomalyDetectionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -7146,18 +7372,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(PcaAnomalyDetector)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(LogisticRegressionBinaryClassifier)} only supports an { 
nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new PcaAnomalyDetectorPipelineStep(output); + return new LogisticRegressionBinaryClassifierPipelineStep(output); } - private class PcaAnomalyDetectorPipelineStep : ILearningPipelinePredictorStep + private class LogisticRegressionBinaryClassifierPipelineStep : ILearningPipelinePredictorStep { - public PcaAnomalyDetectorPipelineStep(Output output) + public LogisticRegressionBinaryClassifierPipelineStep(Output output) { Model = output.PredictorModel; } @@ -7171,12 +7397,17 @@ namespace Trainers { /// - /// Train an Poisson regression model. + /// Train a logistic regression multi class model /// - public sealed partial class PoissonRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class LogisticRegressionClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { + /// + /// Show statistics of training examples. + /// + public bool ShowTrainingStats { get; set; } = false; + /// /// L2 regularization weight /// @@ -7275,7 +7506,7 @@ public sealed partial class PoissonRegressor : Microsoft.ML.Runtime.EntryPoints. 
public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IMulticlassClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -7291,18 +7522,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(PoissonRegressor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(LogisticRegressionClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new PoissonRegressorPipelineStep(output); + return new LogisticRegressionClassifierPipelineStep(output); } - private class PoissonRegressorPipelineStep : ILearningPipelinePredictorStep + private class LogisticRegressionClassifierPipelineStep : ILearningPipelinePredictorStep { - public PoissonRegressorPipelineStep(Output output) + public LogisticRegressionClassifierPipelineStep(Output output) { Model = output.PredictorModel; } @@ -7316,80 +7547,12 @@ namespace Trainers { /// - /// Train an SDCA binary model. + /// Train a MultiClassNaiveBayesTrainer. 
/// - public sealed partial class StochasticDualCoordinateAscentBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class NaiveBayesClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { - /// - /// Loss Function - /// - [JsonConverter(typeof(ComponentSerializer))] - public SDCAClassificationLossFunction LossFunction { get; set; } = new LogLossSDCAClassificationLossFunction(); - - /// - /// Apply weight to the positive class, for imbalanced data - /// - public float PositiveInstanceWeight { get; set; } = 1f; - - /// - /// The calibrator kind to apply to the predictor. Specify null for no calibration - /// - [JsonConverter(typeof(ComponentSerializer))] - public CalibratorTrainer Calibrator { get; set; } = new PlattCalibratorCalibratorTrainer(); - - /// - /// The maximum number of examples to use when training the calibrator - /// - public int MaxCalibrationExamples { get; set; } = 1000000; - - /// - /// L2 regularizer constant. By default the l2 constant is automatically inferred based on data set. - /// - [TlcModule.SweepableDiscreteParamAttribute("L2Const", new object[]{"", 1E-07f, 1E-06f, 1E-05f, 0.0001f, 0.001f, 0.01f})] - public float? L2Const { get; set; } - - /// - /// L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set. - /// - [TlcModule.SweepableDiscreteParamAttribute("L1Threshold", new object[]{"", 0f, 0.25f, 0.5f, 0.75f, 1f})] - public float? L1Threshold { get; set; } - - /// - /// Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. - /// - public int? 
NumThreads { get; set; } - - /// - /// The tolerance for the ratio between duality gap and primal loss for convergence checking. - /// - [TlcModule.SweepableDiscreteParamAttribute("ConvergenceTolerance", new object[]{0.001f, 0.01f, 0.1f, 0.2f})] - public float ConvergenceTolerance { get; set; } = 0.1f; - - /// - /// Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic. - /// - [TlcModule.SweepableDiscreteParamAttribute("MaxIterations", new object[]{"", 10, 20, 100})] - public int? MaxIterations { get; set; } - - /// - /// Shuffle data every epoch? - /// - [TlcModule.SweepableDiscreteParamAttribute("Shuffle", new object[]{false, true})] - public bool Shuffle { get; set; } = true; - - /// - /// Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations. - /// - public int? CheckFrequency { get; set; } - - /// - /// The learning rate for adjusting bias from being regularized. 
- /// - [TlcModule.SweepableDiscreteParamAttribute("BiasLearningRate", new object[]{0f, 0.01f, 0.1f, 1f})] - public float BiasLearningRate { get; set; } - /// /// Column to use for labels /// @@ -7416,7 +7579,7 @@ public sealed partial class StochasticDualCoordinateAscentBinaryClassifier : Mic public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IMulticlassClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -7432,18 +7595,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(StochasticDualCoordinateAscentBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(NaiveBayesClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new StochasticDualCoordinateAscentBinaryClassifierPipelineStep(output); + return new NaiveBayesClassifierPipelineStep(output); } - private class StochasticDualCoordinateAscentBinaryClassifierPipelineStep : ILearningPipelinePredictorStep + private class NaiveBayesClassifierPipelineStep : ILearningPipelinePredictorStep { - public StochasticDualCoordinateAscentBinaryClassifierPipelineStep(Output output) + public NaiveBayesClassifierPipelineStep(Output output) { Model = output.PredictorModel; } @@ -7457,9 +7620,9 @@ namespace Trainers { /// - /// Train an SDCA multi class model + /// Train a Online gradient descent perceptron. 
/// - public sealed partial class StochasticDualCoordinateAscentClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class OnlineGradientDescentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { @@ -7467,53 +7630,83 @@ public sealed partial class StochasticDualCoordinateAscentClassifier : Microsoft /// Loss Function /// [JsonConverter(typeof(ComponentSerializer))] - public SDCAClassificationLossFunction LossFunction { get; set; } = new LogLossSDCAClassificationLossFunction(); + public RegressionLossFunction LossFunction { get; set; } = new SquaredLossRegressionLossFunction(); /// - /// L2 regularizer constant. By default the l2 constant is automatically inferred based on data set. + /// Learning rate /// - [TlcModule.SweepableDiscreteParamAttribute("L2Const", new object[]{"", 1E-07f, 1E-06f, 1E-05f, 0.0001f, 0.001f, 0.01f})] - public float? L2Const { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("LearningRate", new object[]{0.01f, 0.1f, 0.5f, 1f})] + public float LearningRate { get; set; } = 0.1f; /// - /// L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set. + /// Decrease learning rate /// - [TlcModule.SweepableDiscreteParamAttribute("L1Threshold", new object[]{"", 0f, 0.25f, 0.5f, 0.75f, 1f})] - public float? L1Threshold { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("DecreaseLearningRate", new object[]{false, true})] + public bool DecreaseLearningRate { get; set; } = true; /// - /// Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. 
+ /// Number of examples after which weights will be reset to the current average /// - public int? NumThreads { get; set; } + public long? ResetWeightsAfterXExamples { get; set; } /// - /// The tolerance for the ratio between duality gap and primal loss for convergence checking. + /// Instead of updating averaged weights on every example, only update when loss is nonzero /// - [TlcModule.SweepableDiscreteParamAttribute("ConvergenceTolerance", new object[]{0.001f, 0.01f, 0.1f, 0.2f})] - public float ConvergenceTolerance { get; set; } = 0.1f; + public bool DoLazyUpdates { get; set; } = true; /// - /// Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic. + /// L2 Regularization Weight /// - [TlcModule.SweepableDiscreteParamAttribute("MaxIterations", new object[]{"", 10, 20, 100})] - public int? MaxIterations { get; set; } + [TlcModule.SweepableFloatParamAttribute("L2RegularizerWeight", 0f, 0.5f)] + public float L2RegularizerWeight { get; set; } /// - /// Shuffle data every epoch? + /// Extra weight given to more recent updates /// - [TlcModule.SweepableDiscreteParamAttribute("Shuffle", new object[]{false, true})] - public bool Shuffle { get; set; } = true; + public float RecencyGain { get; set; } /// - /// Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations. + /// Whether Recency Gain is multiplicative (vs. additive) /// - public int? CheckFrequency { get; set; } + public bool RecencyGainMulti { get; set; } = false; /// - /// The learning rate for adjusting bias from being regularized. + /// Do averaging? 
/// - [TlcModule.SweepableDiscreteParamAttribute("BiasLearningRate", new object[]{0f, 0.01f, 0.1f, 1f})] - public float BiasLearningRate { get; set; } + public bool Averaged { get; set; } = true; + + /// + /// The inexactness tolerance for averaging + /// + public float AveragedTolerance { get; set; } = 0.01f; + + /// + /// Number of iterations + /// + [TlcModule.SweepableLongParamAttribute("NumIterations", 1, 100, stepSize:10, isLogScale:true)] + public int NumIterations { get; set; } = 1; + + /// + /// Initial Weights and bias, comma-separated + /// + public string InitialWeights { get; set; } + + /// + /// Init weights diameter + /// + [TlcModule.SweepableFloatParamAttribute("InitWtsDiameter", 0f, 1f, numSteps:5)] + public float InitWtsDiameter { get; set; } + + /// + /// Whether to shuffle for each training iteration + /// + [TlcModule.SweepableDiscreteParamAttribute("Shuffle", new object[]{false, true})] + public bool Shuffle { get; set; } = true; + + /// + /// Size of cache when trained in Scope + /// + public int StreamingCacheSize { get; set; } = 1000000; /// /// Column to use for labels @@ -7541,7 +7734,7 @@ public sealed partial class StochasticDualCoordinateAscentClassifier : Microsoft public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IMulticlassClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -7557,18 +7750,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(StochasticDualCoordinateAscentClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as 
an input."); + throw new InvalidOperationException($"{ nameof(OnlineGradientDescentRegressor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new StochasticDualCoordinateAscentClassifierPipelineStep(output); + return new OnlineGradientDescentRegressorPipelineStep(output); } - private class StochasticDualCoordinateAscentClassifierPipelineStep : ILearningPipelinePredictorStep + private class OnlineGradientDescentRegressorPipelineStep : ILearningPipelinePredictorStep { - public StochasticDualCoordinateAscentClassifierPipelineStep(Output output) + public OnlineGradientDescentRegressorPipelineStep(Output output) { Model = output.PredictorModel; } @@ -7582,68 +7775,39 @@ namespace Trainers { /// - /// Train an SDCA regression model + /// Train an PCA Anomaly model. /// - public sealed partial class StochasticDualCoordinateAscentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class PcaAnomalyDetector : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IUnsupervisedTrainerWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Loss Function - /// - [JsonConverter(typeof(ComponentSerializer))] - public SDCARegressionLossFunction LossFunction { get; set; } = new SquaredLossSDCARegressionLossFunction(); - - /// - /// L2 regularizer constant. By default the l2 constant is automatically inferred based on data set. - /// - [TlcModule.SweepableDiscreteParamAttribute("L2Const", new object[]{"", 1E-07f, 1E-06f, 1E-05f, 0.0001f, 0.001f, 0.01f})] - public float? L2Const { get; set; } - - /// - /// L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. 
By default the l1 threshold is automatically inferred based on data set. - /// - [TlcModule.SweepableDiscreteParamAttribute("L1Threshold", new object[]{"", 0f, 0.25f, 0.5f, 0.75f, 1f})] - public float? L1Threshold { get; set; } - - /// - /// Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. - /// - public int? NumThreads { get; set; } - - /// - /// The tolerance for the ratio between duality gap and primal loss for convergence checking. - /// - [TlcModule.SweepableDiscreteParamAttribute("ConvergenceTolerance", new object[]{0.001f, 0.01f, 0.1f, 0.2f})] - public float ConvergenceTolerance { get; set; } = 0.01f; - - /// - /// Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic. + /// The number of components in the PCA /// - [TlcModule.SweepableDiscreteParamAttribute("MaxIterations", new object[]{"", 10, 20, 100})] - public int? MaxIterations { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("Rank", new object[]{10, 20, 40, 80})] + public int Rank { get; set; } = 20; /// - /// Shuffle data every epoch? + /// Oversampling parameter for randomized PCA training /// - [TlcModule.SweepableDiscreteParamAttribute("Shuffle", new object[]{false, true})] - public bool Shuffle { get; set; } = true; + [TlcModule.SweepableDiscreteParamAttribute("Oversampling", new object[]{10, 20, 40})] + public int Oversampling { get; set; } = 20; /// - /// Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations. + /// If enabled, data is centered to be zero mean /// - public int? CheckFrequency { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("Center", new object[]{false, true})] + public bool Center { get; set; } = true; /// - /// The learning rate for adjusting bias from being regularized. 
+ /// The seed for random number generation /// - [TlcModule.SweepableDiscreteParamAttribute("BiasLearningRate", new object[]{0f, 0.01f, 0.1f, 1f})] - public float BiasLearningRate { get; set; } = 1f; + public int? Seed { get; set; } /// - /// Column to use for labels + /// Column to use for example weight /// - public string LabelColumn { get; set; } = "Label"; + public Microsoft.ML.Runtime.EntryPoints.Optional WeightColumn { get; set; } /// /// The data to be used for training @@ -7666,7 +7830,7 @@ public sealed partial class StochasticDualCoordinateAscentRegressor : Microsoft. public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IAnomalyDetectionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -7682,18 +7846,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(StochasticDualCoordinateAscentRegressor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(PcaAnomalyDetector)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new StochasticDualCoordinateAscentRegressorPipelineStep(output); + return new PcaAnomalyDetectorPipelineStep(output); } - private class StochasticDualCoordinateAscentRegressorPipelineStep : ILearningPipelinePredictorStep + private class PcaAnomalyDetectorPipelineStep : ILearningPipelinePredictorStep { - public StochasticDualCoordinateAscentRegressorPipelineStep(Output output) + public 
PcaAnomalyDetectorPipelineStep(Output output) { Model = output.PredictorModel; } @@ -7707,72 +7871,78 @@ namespace Trainers { /// - /// Train an Hogwild SGD binary model. + /// Train an Poisson regression model. /// - public sealed partial class StochasticGradientDescentBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class PoissonRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Loss Function + /// L2 regularization weight /// - [JsonConverter(typeof(ComponentSerializer))] - public ClassificationLossFunction LossFunction { get; set; } = new LogLossClassificationLossFunction(); + [TlcModule.SweepableFloatParamAttribute("L2Weight", 0f, 1f, numSteps:4)] + public float L2Weight { get; set; } = 1f; /// - /// L2 regularizer constant + /// L1 regularization weight /// - [TlcModule.SweepableDiscreteParamAttribute("L2Const", new object[]{1E-07f, 5E-07f, 1E-06f, 5E-06f, 1E-05f})] - public float L2Const { get; set; } = 1E-06f; + [TlcModule.SweepableFloatParamAttribute("L1Weight", 0f, 1f, numSteps:4)] + public float L1Weight { get; set; } = 1f; /// - /// Degree of lock-free parallelism. Defaults to automatic depending on data sparseness. Determinism not guaranteed. + /// Tolerance parameter for optimization convergence. Lower = slower, more accurate /// - public int? NumThreads { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("OptTol", new object[]{0.0001f, 1E-07f})] + public float OptTol { get; set; } = 1E-07f; /// - /// Exponential moving averaged improvement tolerance for convergence + /// Memory size for L-BFGS. 
Lower=faster, less accurate /// - [TlcModule.SweepableDiscreteParamAttribute("ConvergenceTolerance", new object[]{0.01f, 0.001f, 0.0001f, 1E-05f})] - public double ConvergenceTolerance { get; set; } = 0.0001d; + [TlcModule.SweepableDiscreteParamAttribute("MemorySize", new object[]{5, 20, 50})] + public int MemorySize { get; set; } = 20; /// - /// Maximum number of iterations; set to 1 to simulate online learning. + /// Maximum iterations. /// - [TlcModule.SweepableDiscreteParamAttribute("MaxIterations", new object[]{1, 5, 10, 20})] - public int MaxIterations { get; set; } = 20; + [TlcModule.SweepableLongParamAttribute("MaxIterations", 1, 2147483647)] + public int MaxIterations { get; set; } = 2147483647; /// - /// Initial learning rate (only used by SGD) + /// Run SGD to initialize LR weights, converging to this tolerance /// - public double InitLearningRate { get; set; } = 0.01d; + public float SgdInitializationTolerance { get; set; } /// - /// Shuffle data every epoch? + /// If set to true, produce no output during training. /// - [TlcModule.SweepableDiscreteParamAttribute("Shuffle", new object[]{false, true})] - public bool Shuffle { get; set; } = true; + public bool Quiet { get; set; } = false; /// - /// Apply weight to the positive class, for imbalanced data + /// Init weights diameter /// - public float PositiveInstanceWeight { get; set; } = 1f; + [TlcModule.SweepableFloatParamAttribute("InitWtsDiameter", 0f, 1f, numSteps:5)] + public float InitWtsDiameter { get; set; } /// - /// Convergence check frequency (in terms of number of iterations). Default equals number of threads + /// Whether or not to use threads. Default is true /// - public int? CheckFrequency { get; set; } + public bool UseThreads { get; set; } = true; /// - /// The calibrator kind to apply to the predictor. 
Specify null for no calibration + /// Number of threads /// - [JsonConverter(typeof(ComponentSerializer))] - public CalibratorTrainer Calibrator { get; set; } = new PlattCalibratorCalibratorTrainer(); + public int? NumThreads { get; set; } /// - /// The maximum number of examples to use when training the calibrator + /// Force densification of the internal optimization vectors /// - public int MaxCalibrationExamples { get; set; } = 1000000; + [TlcModule.SweepableDiscreteParamAttribute("DenseOptimizer", new object[]{false, true})] + public bool DenseOptimizer { get; set; } = false; + + /// + /// Enforce non-negative weights + /// + public bool EnforceNonNegativity { get; set; } = false; /// /// Column to use for example weight @@ -7805,7 +7975,7 @@ public sealed partial class StochasticGradientDescentBinaryClassifier : Microsof public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// /// The trained model @@ -7821,18 +7991,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(StochasticGradientDescentBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(PoissonRegressor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new StochasticGradientDescentBinaryClassifierPipelineStep(output); + return new PoissonRegressorPipelineStep(output); } - private class 
StochasticGradientDescentBinaryClassifierPipelineStep : ILearningPipelinePredictorStep + private class PoissonRegressorPipelineStep : ILearningPipelinePredictorStep { - public StochasticGradientDescentBinaryClassifierPipelineStep(Output output) + public PoissonRegressorPipelineStep(Output output) { Model = output.PredictorModel; } @@ -7842,121 +8012,119 @@ public StochasticGradientDescentBinaryClassifierPipelineStep(Output output) } } - namespace Transforms + namespace Trainers { /// - /// Approximate bootstrap sampling. + /// Train an SDCA binary model. /// - public sealed partial class ApproximateBootstrapSampler : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class StochasticDualCoordinateAscentBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { /// - /// Whether this is the out-of-bag sample, that is, all those rows that are not selected by the transform. + /// Loss Function /// - public bool Complement { get; set; } = false; + [JsonConverter(typeof(ComponentSerializer))] + public SDCAClassificationLossFunction LossFunction { get; set; } = new LogLossSDCAClassificationLossFunction(); /// - /// The random seed. If unspecified random state will be instead derived from the environment. + /// Apply weight to the positive class, for imbalanced data /// - public uint? Seed { get; set; } + public float PositiveInstanceWeight { get; set; } = 1f; /// - /// Whether we should attempt to shuffle the source data. By default on, but can be turned off for efficiency. + /// The calibrator kind to apply to the predictor. 
Specify null for no calibration /// - public bool ShuffleInput { get; set; } = true; + [JsonConverter(typeof(ComponentSerializer))] + public CalibratorTrainer Calibrator { get; set; } = new PlattCalibratorCalibratorTrainer(); /// - /// When shuffling the output, the number of output rows to keep in that pool. Note that shuffling of output is completely distinct from shuffling of input. + /// The maximum number of examples to use when training the calibrator /// - public int PoolSize { get; set; } = 1000; + public int MaxCalibrationExamples { get; set; } = 1000000; /// - /// Input dataset + /// L2 regularizer constant. By default the l2 constant is automatically inferred based on data set. /// - public Var Data { get; set; } = new Var(); + [TlcModule.SweepableDiscreteParamAttribute("L2Const", new object[]{"", 1E-07f, 1E-06f, 1E-05f, 0.0001f, 0.001f, 0.01f})] + public float? L2Const { get; set; } + /// + /// L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set. + /// + [TlcModule.SweepableDiscreteParamAttribute("L1Threshold", new object[]{"", 0f, 0.25f, 0.5f, 0.75f, 1f})] + public float? L1Threshold { get; set; } - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput - { - /// - /// Transformed dataset - /// - public Var OutputData { get; set; } = new Var(); + /// + /// Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. + /// + public int? NumThreads { get; set; } - /// - /// Transform model - /// - public Var Model { get; set; } = new Var(); + /// + /// The tolerance for the ratio between duality gap and primal loss for convergence checking. 
+ /// + [TlcModule.SweepableDiscreteParamAttribute("ConvergenceTolerance", new object[]{0.001f, 0.01f, 0.1f, 0.2f})] + public float ConvergenceTolerance { get; set; } = 0.1f; - } - public Var GetInputData() => Data; - - public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) - { - if (previousStep != null) - { - if (!(previousStep is ILearningPipelineDataStep dataStep)) - { - throw new InvalidOperationException($"{ nameof(ApproximateBootstrapSampler)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); - } + /// + /// Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic. + /// + [TlcModule.SweepableDiscreteParamAttribute("MaxIterations", new object[]{"", 10, 20, 100})] + public int? MaxIterations { get; set; } - Data = dataStep.Data; - } - Output output = experiment.Add(this); - return new ApproximateBootstrapSamplerPipelineStep(output); - } + /// + /// Shuffle data every epoch? + /// + [TlcModule.SweepableDiscreteParamAttribute("Shuffle", new object[]{false, true})] + public bool Shuffle { get; set; } = true; - private class ApproximateBootstrapSamplerPipelineStep : ILearningPipelineDataStep - { - public ApproximateBootstrapSamplerPipelineStep(Output output) - { - Data = output.OutputData; - Model = output.Model; - } + /// + /// Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations. + /// + public int? CheckFrequency { get; set; } - public Var Data { get; } - public Var Model { get; } - } - } - } + /// + /// The learning rate for adjusting bias from being regularized. 
+ /// + [TlcModule.SweepableDiscreteParamAttribute("BiasLearningRate", new object[]{0f, 0.01f, 0.1f, 1f})] + public float BiasLearningRate { get; set; } - namespace Transforms - { + /// + /// Column to use for labels + /// + public string LabelColumn { get; set; } = "Label"; - /// - /// For binary prediction, it renames the PredictedLabel and Score columns to include the name of the positive class. - /// - public sealed partial class BinaryPredictionScoreColumnsRenamer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem - { + /// + /// The data to be used for training + /// + public Var TrainingData { get; set; } = new Var(); + /// + /// Column to use for features + /// + public string FeatureColumn { get; set; } = "Features"; /// - /// The predictor model used in scoring + /// Normalize option for the feature column /// - public Var PredictorModel { get; set; } = new Var(); + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// - /// Input dataset + /// Whether learner should cache input training data /// - public Var Data { get; set; } = new Var(); + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// - /// Transformed dataset - /// - public Var OutputData { get; set; } = new Var(); - - /// - /// Transform model + /// The trained model /// - public Var Model { get; set; } = new Var(); + public Var PredictorModel { get; set; } = new Var(); } - public Var GetInputData() => Data; + public Var GetInputData() => TrainingData; public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) { @@ 
-7964,148 +8132,124 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(BinaryPredictionScoreColumnsRenamer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(StochasticDualCoordinateAscentBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } - Data = dataStep.Data; + TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new BinaryPredictionScoreColumnsRenamerPipelineStep(output); + return new StochasticDualCoordinateAscentBinaryClassifierPipelineStep(output); } - private class BinaryPredictionScoreColumnsRenamerPipelineStep : ILearningPipelineDataStep + private class StochasticDualCoordinateAscentBinaryClassifierPipelineStep : ILearningPipelinePredictorStep { - public BinaryPredictionScoreColumnsRenamerPipelineStep(Output output) + public StochasticDualCoordinateAscentBinaryClassifierPipelineStep(Output output) { - Data = output.OutputData; - Model = output.Model; + Model = output.PredictorModel; } - public Var Data { get; } - public Var Model { get; } + public Var Model { get; } } } } - namespace Transforms + namespace Trainers { - public sealed partial class NormalizeTransformBinColumn : OneToOneColumn, IOneToOneColumn + /// + /// Train an SDCA multi class model + /// + public sealed partial class StochasticDualCoordinateAscentClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { + + /// - /// Max number of bins, power of 2 recommended + /// Loss Function /// - public int? 
NumBins { get; set; } + [JsonConverter(typeof(ComponentSerializer))] + public SDCAClassificationLossFunction LossFunction { get; set; } = new LogLossSDCAClassificationLossFunction(); /// - /// Whether to map zero to zero, preserving sparsity + /// L2 regularizer constant. By default the l2 constant is automatically inferred based on data set. /// - public bool? FixZero { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("L2Const", new object[]{"", 1E-07f, 1E-06f, 1E-05f, 0.0001f, 0.001f, 0.01f})] + public float? L2Const { get; set; } /// - /// Max number of examples used to train the normalizer + /// L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set. /// - public long? MaxTrainingExamples { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("L1Threshold", new object[]{"", 0f, 0.25f, 0.5f, 0.75f, 1f})] + public float? L1Threshold { get; set; } /// - /// Name of the new column + /// Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. /// - public string Name { get; set; } + public int? NumThreads { get; set; } /// - /// Name of the source column + /// The tolerance for the ratio between duality gap and primal loss for convergence checking. /// - public string Source { get; set; } - - } + [TlcModule.SweepableDiscreteParamAttribute("ConvergenceTolerance", new object[]{0.001f, 0.01f, 0.1f, 0.2f})] + public float ConvergenceTolerance { get; set; } = 0.1f; - /// - /// The values are assigned into equidensity bins and a value is mapped to its bin_number/number_of_bins. - /// - public sealed partial class BinNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem - { + /// + /// Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic. 
+ /// + [TlcModule.SweepableDiscreteParamAttribute("MaxIterations", new object[]{"", 10, 20, 100})] + public int? MaxIterations { get; set; } - public BinNormalizer() - { - } - - public BinNormalizer(params string[] inputColumns) - { - if (inputColumns != null) - { - foreach (string input in inputColumns) - { - AddColumn(input); - } - } - } - - public BinNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - if (inputOutputColumns != null) - { - foreach (var inputOutput in inputOutputColumns) - { - AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); - } - } - } - - public void AddColumn(string inputColumn) - { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); - Column = list.ToArray(); - } + /// + /// Shuffle data every epoch? + /// + [TlcModule.SweepableDiscreteParamAttribute("Shuffle", new object[]{false, true})] + public bool Shuffle { get; set; } = true; - public void AddColumn(string outputColumn, string inputColumn) - { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); - Column = list.ToArray(); - } + /// + /// Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations. + /// + public int? CheckFrequency { get; set; } + /// + /// The learning rate for adjusting bias from being regularized. 
+ /// + [TlcModule.SweepableDiscreteParamAttribute("BiasLearningRate", new object[]{0f, 0.01f, 0.1f, 1f})] + public float BiasLearningRate { get; set; } /// - /// New column definition(s) (optional form: name:src) + /// Column to use for labels /// - public NormalizeTransformBinColumn[] Column { get; set; } + public string LabelColumn { get; set; } = "Label"; /// - /// Max number of bins, power of 2 recommended + /// The data to be used for training /// - public int NumBins { get; set; } = 1024; + public Var TrainingData { get; set; } = new Var(); /// - /// Whether to map zero to zero, preserving sparsity + /// Column to use for features /// - public bool FixZero { get; set; } = true; + public string FeatureColumn { get; set; } = "Features"; /// - /// Max number of examples used to train the normalizer + /// Normalize option for the feature column /// - public long MaxTrainingExamples { get; set; } = 1000000000; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// - /// Input dataset + /// Whether learner should cache input training data /// - public Var Data { get; set; } = new Var(); + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IMulticlassClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// - /// Transformed dataset - /// - public Var OutputData { get; set; } = new Var(); - - /// - /// Transform model + /// The trained model /// - public Var Model { get; set; } = new Var(); + public Var PredictorModel { get; set; } = new Var(); } - public Var GetInputData() => Data; + public Var GetInputData() => TrainingData; public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) { @@ -8113,176 
+8257,124 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(BinNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(StochasticDualCoordinateAscentClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } - Data = dataStep.Data; + TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new BinNormalizerPipelineStep(output); + return new StochasticDualCoordinateAscentClassifierPipelineStep(output); } - private class BinNormalizerPipelineStep : ILearningPipelineDataStep + private class StochasticDualCoordinateAscentClassifierPipelineStep : ILearningPipelinePredictorStep { - public BinNormalizerPipelineStep(Output output) + public StochasticDualCoordinateAscentClassifierPipelineStep(Output output) { - Data = output.OutputData; - Model = output.Model; + Model = output.PredictorModel; } - public Var Data { get; } - public Var Model { get; } + public Var Model { get; } } } } - namespace Transforms + namespace Trainers { - public enum CategoricalTransformOutputKind : byte + + /// + /// Train an SDCA regression model + /// + public sealed partial class StochasticDualCoordinateAscentRegressor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { - Bag = 1, - Ind = 2, - Key = 3, - Bin = 4 - } - public sealed partial class CategoricalHashTransformColumn : OneToOneColumn, IOneToOneColumn - { /// - /// The number of bits to hash into. Must be between 1 and 30, inclusive. + /// Loss Function /// - public int? 
HashBits { get; set; } + [JsonConverter(typeof(ComponentSerializer))] + public SDCARegressionLossFunction LossFunction { get; set; } = new SquaredLossSDCARegressionLossFunction(); /// - /// Hashing seed + /// L2 regularizer constant. By default the l2 constant is automatically inferred based on data set. /// - public uint? Seed { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("L2Const", new object[]{"", 1E-07f, 1E-06f, 1E-05f, 0.0001f, 0.001f, 0.01f})] + public float? L2Const { get; set; } /// - /// Whether the position of each term should be included in the hash + /// L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set. /// - public bool? Ordered { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("L1Threshold", new object[]{"", 0f, 0.25f, 0.5f, 0.75f, 1f})] + public float? L1Threshold { get; set; } /// - /// Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit. + /// Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed. /// - public int? InvertHash { get; set; } + public int? NumThreads { get; set; } /// - /// Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index) + /// The tolerance for the ratio between duality gap and primal loss for convergence checking. /// - public CategoricalTransformOutputKind? OutputKind { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("ConvergenceTolerance", new object[]{0.001f, 0.01f, 0.1f, 0.2f})] + public float ConvergenceTolerance { get; set; } = 0.01f; /// - /// Name of the new column + /// Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic. /// - public string Name { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("MaxIterations", new object[]{"", 10, 20, 100})] + public int? 
MaxIterations { get; set; } /// - /// Name of the source column + /// Shuffle data every epoch? /// - public string Source { get; set; } - - } - - /// - /// Encodes the categorical variable with hash-based encoding - /// - public sealed partial class CategoricalHashOneHotVectorizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem - { - - public CategoricalHashOneHotVectorizer() - { - } - - public CategoricalHashOneHotVectorizer(params string[] inputColumns) - { - if (inputColumns != null) - { - foreach (string input in inputColumns) - { - AddColumn(input); - } - } - } - - public CategoricalHashOneHotVectorizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - if (inputOutputColumns != null) - { - foreach (var inputOutput in inputOutputColumns) - { - AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); - } - } - } - - public void AddColumn(string inputColumn) - { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); - Column = list.ToArray(); - } - - public void AddColumn(string outputColumn, string inputColumn) - { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); - Column = list.ToArray(); - } - + [TlcModule.SweepableDiscreteParamAttribute("Shuffle", new object[]{false, true})] + public bool Shuffle { get; set; } = true; /// - /// New column definition(s) (optional form: name:hashBits:src) + /// Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations. /// - public CategoricalHashTransformColumn[] Column { get; set; } + public int? CheckFrequency { get; set; } /// - /// Number of bits to hash into. Must be between 1 and 30, inclusive. + /// The learning rate for adjusting bias from being regularized. 
/// - public int HashBits { get; set; } = 16; + [TlcModule.SweepableDiscreteParamAttribute("BiasLearningRate", new object[]{0f, 0.01f, 0.1f, 1f})] + public float BiasLearningRate { get; set; } = 1f; /// - /// Hashing seed + /// Column to use for labels /// - public uint Seed { get; set; } = 314489979; + public string LabelColumn { get; set; } = "Label"; /// - /// Whether the position of each term should be included in the hash + /// The data to be used for training /// - public bool Ordered { get; set; } = true; + public Var TrainingData { get; set; } = new Var(); /// - /// Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit. + /// Column to use for features /// - public int InvertHash { get; set; } + public string FeatureColumn { get; set; } = "Features"; /// - /// Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index) + /// Normalize option for the feature column /// - public CategoricalTransformOutputKind OutputKind { get; set; } = CategoricalTransformOutputKind.Bag; + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; /// - /// Input dataset + /// Whether learner should cache input training data /// - public Var Data { get; set; } = new Var(); + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IRegressionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput { /// - /// Transformed dataset - /// - public Var OutputData { get; set; } = new Var(); - - /// - /// Transform model + /// The trained model /// - public Var Model { get; set; } = new Var(); + public Var PredictorModel { get; set; } = new Var(); } - public Var GetInputData() => Data; + public Var GetInputData() => 
TrainingData; public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) { @@ -8290,174 +8382,138 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(CategoricalHashOneHotVectorizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(StochasticDualCoordinateAscentRegressor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } - Data = dataStep.Data; + TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new CategoricalHashOneHotVectorizerPipelineStep(output); + return new StochasticDualCoordinateAscentRegressorPipelineStep(output); } - private class CategoricalHashOneHotVectorizerPipelineStep : ILearningPipelineDataStep + private class StochasticDualCoordinateAscentRegressorPipelineStep : ILearningPipelinePredictorStep { - public CategoricalHashOneHotVectorizerPipelineStep(Output output) + public StochasticDualCoordinateAscentRegressorPipelineStep(Output output) { - Data = output.OutputData; - Model = output.Model; + Model = output.PredictorModel; } - public Var Data { get; } - public Var Model { get; } + public Var Model { get; } } } } - namespace Transforms + namespace Trainers { - public enum TermTransformSortOrder : byte + + /// + /// Train an Hogwild SGD binary model. 
+ /// + public sealed partial class StochasticGradientDescentBinaryClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { - Occurrence = 0, - Value = 1 - } - public sealed partial class CategoricalTransformColumn : OneToOneColumn, IOneToOneColumn - { /// - /// Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector + /// Loss Function /// - public CategoricalTransformOutputKind? OutputKind { get; set; } + [JsonConverter(typeof(ComponentSerializer))] + public ClassificationLossFunction LossFunction { get; set; } = new LogLossClassificationLossFunction(); /// - /// Maximum number of terms to keep when auto-training + /// L2 regularizer constant /// - public int? MaxNumTerms { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("L2Const", new object[]{1E-07f, 5E-07f, 1E-06f, 5E-06f, 1E-05f})] + public float L2Const { get; set; } = 1E-06f; /// - /// List of terms + /// Degree of lock-free parallelism. Defaults to automatic depending on data sparseness. Determinism not guaranteed. /// - public string[] Term { get; set; } + public int? NumThreads { get; set; } /// - /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). + /// Exponential moving averaged improvement tolerance for convergence /// - public TermTransformSortOrder? 
Sort { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("ConvergenceTolerance", new object[]{0.01f, 0.001f, 0.0001f, 1E-05f})] + public double ConvergenceTolerance { get; set; } = 0.0001d; /// - /// Whether key value metadata should be text, regardless of the actual input type + /// Maximum number of iterations; set to 1 to simulate online learning. /// - public bool? TextKeyValues { get; set; } + [TlcModule.SweepableDiscreteParamAttribute("MaxIterations", new object[]{1, 5, 10, 20})] + public int MaxIterations { get; set; } = 20; /// - /// Name of the new column + /// Initial learning rate (only used by SGD) /// - public string Name { get; set; } + public double InitLearningRate { get; set; } = 0.01d; /// - /// Name of the source column + /// Shuffle data every epoch? /// - public string Source { get; set; } - - } - - /// - /// Encodes the categorical variable with one-hot encoding based on term dictionary - /// - public sealed partial class CategoricalOneHotVectorizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem - { - - public CategoricalOneHotVectorizer() - { - } - - public CategoricalOneHotVectorizer(params string[] inputColumns) - { - if (inputColumns != null) - { - foreach (string input in inputColumns) - { - AddColumn(input); - } - } - } - - public CategoricalOneHotVectorizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - if (inputOutputColumns != null) - { - foreach (var inputOutput in inputOutputColumns) - { - AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); - } - } - } - - public void AddColumn(string inputColumn) - { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); - Column = list.ToArray(); - } - - public void AddColumn(string outputColumn, string inputColumn) - { - var list = Column == null ? 
new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); - Column = list.ToArray(); - } - + [TlcModule.SweepableDiscreteParamAttribute("Shuffle", new object[]{false, true})] + public bool Shuffle { get; set; } = true; /// - /// New column definition(s) (optional form: name:src) + /// Apply weight to the positive class, for imbalanced data /// - public CategoricalTransformColumn[] Column { get; set; } + public float PositiveInstanceWeight { get; set; } = 1f; /// - /// Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index) + /// Convergence check frequency (in terms of number of iterations). Default equals number of threads /// - public CategoricalTransformOutputKind OutputKind { get; set; } = CategoricalTransformOutputKind.Ind; + public int? CheckFrequency { get; set; } /// - /// Maximum number of terms to keep per column when auto-training + /// The calibrator kind to apply to the predictor. Specify null for no calibration /// - public int MaxNumTerms { get; set; } = 1000000; + [JsonConverter(typeof(ComponentSerializer))] + public CalibratorTrainer Calibrator { get; set; } = new PlattCalibratorCalibratorTrainer(); /// - /// List of terms + /// The maximum number of examples to use when training the calibrator /// - public string[] Term { get; set; } + public int MaxCalibrationExamples { get; set; } = 1000000; /// - /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). 
+ /// Column to use for example weight /// - public TermTransformSortOrder Sort { get; set; } = TermTransformSortOrder.Occurrence; + public Microsoft.ML.Runtime.EntryPoints.Optional WeightColumn { get; set; } /// - /// Whether key value metadata should be text, regardless of the actual input type + /// Column to use for labels /// - public bool TextKeyValues { get; set; } = true; + public string LabelColumn { get; set; } = "Label"; /// - /// Input dataset + /// The data to be used for training /// - public Var Data { get; set; } = new Var(); + public Var TrainingData { get; set; } = new Var(); + /// + /// Column to use for features + /// + public string FeatureColumn { get; set; } = "Features"; + + /// + /// Normalize option for the feature column + /// + public Microsoft.ML.Models.NormalizeOption NormalizeFeatures { get; set; } = Microsoft.ML.Models.NormalizeOption.Auto; + + /// + /// Whether learner should cache input training data + /// + public Microsoft.ML.Models.CachingOptions Caching { get; set; } = Microsoft.ML.Models.CachingOptions.Auto; - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput - { - /// - /// Transformed dataset - /// - public Var OutputData { get; set; } = new Var(); + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IBinaryClassificationOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { /// - /// Transform model + /// The trained model /// - public Var Model { get; set; } = new Var(); + public Var PredictorModel { get; set; } = new Var(); } - public Var GetInputData() => Data; + public Var GetInputData() => TrainingData; public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) { @@ -8465,25 +8521,23 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ 
nameof(CategoricalOneHotVectorizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(StochasticGradientDescentBinaryClassifier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } - Data = dataStep.Data; + TrainingData = dataStep.Data; } Output output = experiment.Add(this); - return new CategoricalOneHotVectorizerPipelineStep(output); + return new StochasticGradientDescentBinaryClassifierPipelineStep(output); } - private class CategoricalOneHotVectorizerPipelineStep : ILearningPipelineDataStep + private class StochasticGradientDescentBinaryClassifierPipelineStep : ILearningPipelinePredictorStep { - public CategoricalOneHotVectorizerPipelineStep(Output output) + public StochasticGradientDescentBinaryClassifierPipelineStep(Output output) { - Data = output.OutputData; - Model = output.Model; + Model = output.PredictorModel; } - public Var Data { get; } - public Var Model { get; } + public Var Model { get; } } } } @@ -8491,76 +8545,32 @@ public CategoricalOneHotVectorizerPipelineStep(Output output) namespace Transforms { - public sealed partial class CharTokenizeTransformColumn : OneToOneColumn, IOneToOneColumn - { - /// - /// Name of the new column - /// - public string Name { get; set; } - - /// - /// Name of the source column - /// - public string Source { get; set; } - - } - /// - /// Character-oriented tokenizer where text is considered a sequence of characters. + /// Approximate bootstrap sampling. 
/// - public sealed partial class CharacterTokenizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class ApproximateBootstrapSampler : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public CharacterTokenizer() - { - } - - public CharacterTokenizer(params string[] inputColumns) - { - if (inputColumns != null) - { - foreach (string input in inputColumns) - { - AddColumn(input); - } - } - } - - public CharacterTokenizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - if (inputOutputColumns != null) - { - foreach (var inputOutput in inputOutputColumns) - { - AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); - } - } - } - - public void AddColumn(string inputColumn) - { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); - Column = list.ToArray(); - } - public void AddColumn(string outputColumn, string inputColumn) - { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); - Column = list.ToArray(); - } + /// + /// Whether this is the out-of-bag sample, that is, all those rows that are not selected by the transform. + /// + public bool Complement { get; set; } = false; + /// + /// The random seed. If unspecified random state will be instead derived from the environment. + /// + public uint? Seed { get; set; } /// - /// New column definition(s) (optional form: name:src) + /// Whether we should attempt to shuffle the source data. By default on, but can be turned off for efficiency. 
/// - public CharTokenizeTransformColumn[] Column { get; set; } + public bool ShuffleInput { get; set; } = true; /// - /// Whether to mark the beginning/end of each row/slot with start of text character (0x02)/end of text character (0x03) + /// When shuffling the output, the number of output rows to keep in that pool. Note that shuffling of output is completely distinct from shuffling of input. /// - public bool UseMarkerChars { get; set; } = true; + public int PoolSize { get; set; } = 1000; /// /// Input dataset @@ -8589,18 +8599,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(CharacterTokenizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(ApproximateBootstrapSampler)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new CharacterTokenizerPipelineStep(output); + return new ApproximateBootstrapSamplerPipelineStep(output); } - private class CharacterTokenizerPipelineStep : ILearningPipelineDataStep + private class ApproximateBootstrapSamplerPipelineStep : ILearningPipelineDataStep { - public CharacterTokenizerPipelineStep(Output output) + public ApproximateBootstrapSamplerPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -8615,47 +8625,17 @@ public CharacterTokenizerPipelineStep(Output output) namespace Transforms { - public sealed partial class ConcatTransformColumn : ManyToOneColumn, IManyToOneColumn - { - /// - /// Name of the new column - /// - public string Name { get; set; } - - /// - /// Name of the source column - /// - public string[] Source { get; set; } - - } - /// - /// Concatenates two columns of the same item type. 
+ /// For binary prediction, it renames the PredictedLabel and Score columns to include the name of the positive class. /// - public sealed partial class ColumnConcatenator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class BinaryPredictionScoreColumnsRenamer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public ColumnConcatenator() - { - } - - public ColumnConcatenator(string outputColumn, params string[] inputColumns) - { - AddColumn(outputColumn, inputColumns); - } - - public void AddColumn(string name, params string[] source) - { - var list = Column == null ? new List() : new List(Column); - list.Add(ManyToOneColumn.Create(name, source)); - Column = list.ToArray(); - } - /// - /// New column definition(s) (optional form: name:srcs) + /// The predictor model used in scoring /// - public ConcatTransformColumn[] Column { get; set; } + public Var PredictorModel { get; set; } = new Var(); /// /// Input dataset @@ -8684,18 +8664,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(ColumnConcatenator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(BinaryPredictionScoreColumnsRenamer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new ColumnConcatenatorPipelineStep(output); + return new BinaryPredictionScoreColumnsRenamerPipelineStep(output); } - private class ColumnConcatenatorPipelineStep : ILearningPipelineDataStep + private class BinaryPredictionScoreColumnsRenamerPipelineStep : ILearningPipelineDataStep { - public ColumnConcatenatorPipelineStep(Output output) + public 
BinaryPredictionScoreColumnsRenamerPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -8710,8 +8690,23 @@ public ColumnConcatenatorPipelineStep(Output output) namespace Transforms { - public sealed partial class CopyColumnsTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class NormalizeTransformBinColumn : OneToOneColumn, IOneToOneColumn { + /// + /// Max number of bins, power of 2 recommended + /// + public int? NumBins { get; set; } + + /// + /// Whether to map zero to zero, preserving sparsity + /// + public bool? FixZero { get; set; } + + /// + /// Max number of examples used to train the normalizer + /// + public long? MaxTrainingExamples { get; set; } + /// /// Name of the new column /// @@ -8725,16 +8720,16 @@ public sealed partial class CopyColumnsTransformColumn : OneToOneColumn - /// Duplicates columns from the dataset + /// The values are assigned into equidensity bins and a value is mapped to its bin_number/number_of_bins. /// - public sealed partial class ColumnCopier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class BinNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public ColumnCopier() + public BinNormalizer() { } - public ColumnCopier(params string[] inputColumns) + public BinNormalizer(params string[] inputColumns) { if (inputColumns != null) { @@ -8745,7 +8740,7 @@ public ColumnCopier(params string[] inputColumns) } } - public ColumnCopier(params (string inputColumn, string outputColumn)[] inputOutputColumns) + public BinNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) { if (inputOutputColumns != null) { @@ -8758,15 +8753,15 @@ public ColumnCopier(params (string inputColumn, string outputColumn)[] inputOutp public void AddColumn(string inputColumn) { - var list = Column == null ? 
new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } @@ -8774,7 +8769,22 @@ public void AddColumn(string outputColumn, string inputColumn) /// /// New column definition(s) (optional form: name:src) /// - public CopyColumnsTransformColumn[] Column { get; set; } + public NormalizeTransformBinColumn[] Column { get; set; } + + /// + /// Max number of bins, power of 2 recommended + /// + public int NumBins { get; set; } = 1024; + + /// + /// Whether to map zero to zero, preserving sparsity + /// + public bool FixZero { get; set; } = true; + + /// + /// Max number of examples used to train the normalizer + /// + public long MaxTrainingExamples { get; set; } = 1000000000; /// /// Input dataset @@ -8803,18 +8813,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(ColumnCopier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(BinNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new ColumnCopierPipelineStep(output); + return new BinNormalizerPipelineStep(output); } - private class ColumnCopierPipelineStep : ILearningPipelineDataStep + private class BinNormalizerPipelineStep : ILearningPipelineDataStep { - public ColumnCopierPipelineStep(Output output) + public 
BinNormalizerPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -8828,83 +8838,130 @@ public ColumnCopierPipelineStep(Output output) namespace Transforms { - - /// - /// Drops columns from the dataset - /// - public sealed partial class ColumnDropper : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public enum CategoricalTransformOutputKind : byte { - - - /// - /// Column name to drop + Bag = 1, + Ind = 2, + Key = 3, + Bin = 4 + } + + + public sealed partial class CategoricalHashTransformColumn : OneToOneColumn, IOneToOneColumn + { + /// + /// The number of bits to hash into. Must be between 1 and 30, inclusive. /// - public string[] Column { get; set; } + public int? HashBits { get; set; } /// - /// Input dataset + /// Hashing seed /// - public Var Data { get; set; } = new Var(); + public uint? Seed { get; set; } + /// + /// Whether the position of each term should be included in the hash + /// + public bool? Ordered { get; set; } - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput - { - /// - /// Transformed dataset - /// - public Var OutputData { get; set; } = new Var(); + /// + /// Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit. + /// + public int? InvertHash { get; set; } - /// - /// Transform model - /// - public Var Model { get; set; } = new Var(); + /// + /// Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index) + /// + public CategoricalTransformOutputKind? 
OutputKind { get; set; } + + /// + /// Name of the new column + /// + public string Name { get; set; } + + /// + /// Name of the source column + /// + public string Source { get; set; } + + } + + /// + /// Encodes the categorical variable with hash-based encoding + /// + public sealed partial class CategoricalHashOneHotVectorizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { + public CategoricalHashOneHotVectorizer() + { } - public Var GetInputData() => Data; - public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + public CategoricalHashOneHotVectorizer(params string[] inputColumns) { - if (previousStep != null) + if (inputColumns != null) { - if (!(previousStep is ILearningPipelineDataStep dataStep)) + foreach (string input in inputColumns) { - throw new InvalidOperationException($"{ nameof(ColumnDropper)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + AddColumn(input); } - - Data = dataStep.Data; } - Output output = experiment.Add(this); - return new ColumnDropperPipelineStep(output); } - - private class ColumnDropperPipelineStep : ILearningPipelineDataStep + + public CategoricalHashOneHotVectorizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) { - public ColumnDropperPipelineStep(Output output) + if (inputOutputColumns != null) { - Data = output.OutputData; - Model = output.Model; + foreach (var inputOutput in inputOutputColumns) + { + AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); + } } + } + + public void AddColumn(string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); + Column = list.ToArray(); + } - public Var Data { get; } - public Var Model { get; } + public void AddColumn(string outputColumn, string inputColumn) + { + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + Column = list.ToArray(); } - } - } - namespace Transforms - { - /// - /// Selects a set of columns, dropping all others - /// - public sealed partial class ColumnSelector : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem - { + /// + /// New column definition(s) (optional form: name:hashBits:src) + /// + public CategoricalHashTransformColumn[] Column { get; set; } + /// + /// Number of bits to hash into. Must be between 1 and 30, inclusive. + /// + public int HashBits { get; set; } = 16; /// - /// Column name to keep + /// Hashing seed /// - public string[] Column { get; set; } + public uint Seed { get; set; } = 314489979; + + /// + /// Whether the position of each term should be included in the hash + /// + public bool Ordered { get; set; } = true; + + /// + /// Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit. 
+ /// + public int InvertHash { get; set; } + + /// + /// Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index) + /// + public CategoricalTransformOutputKind OutputKind { get; set; } = CategoricalTransformOutputKind.Bag; /// /// Input dataset @@ -8933,18 +8990,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(ColumnSelector)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(CategoricalHashOneHotVectorizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new ColumnSelectorPipelineStep(output); + return new CategoricalHashOneHotVectorizerPipelineStep(output); } - private class ColumnSelectorPipelineStep : ILearningPipelineDataStep + private class CategoricalHashOneHotVectorizerPipelineStep : ILearningPipelineDataStep { - public ColumnSelectorPipelineStep(Output output) + public CategoricalHashOneHotVectorizerPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -8958,18 +9015,39 @@ public ColumnSelectorPipelineStep(Output output) namespace Transforms { + public enum TermTransformSortOrder : byte + { + Occurrence = 0, + Value = 1 + } - public sealed partial class ConvertTransformColumn : OneToOneColumn, IOneToOneColumn + + public sealed partial class CategoricalTransformColumn : OneToOneColumn, IOneToOneColumn { /// - /// The result type + /// Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector /// - public Microsoft.ML.Data.DataKind? ResultType { get; set; } + public CategoricalTransformOutputKind? 
OutputKind { get; set; } /// - /// For a key column, this defines the range of values + /// Maximum number of terms to keep when auto-training /// - public string Range { get; set; } + public int? MaxNumTerms { get; set; } + + /// + /// List of terms + /// + public string[] Term { get; set; } + + /// + /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). + /// + public TermTransformSortOrder? Sort { get; set; } + + /// + /// Whether key value metadata should be text, regardless of the actual input type + /// + public bool? TextKeyValues { get; set; } /// /// Name of the new column @@ -8984,16 +9062,16 @@ public sealed partial class ConvertTransformColumn : OneToOneColumn - /// Converts a column to a different type, using standard conversions. + /// Encodes the categorical variable with one-hot encoding based on term dictionary /// - public sealed partial class ColumnTypeConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class CategoricalOneHotVectorizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public ColumnTypeConverter() + public CategoricalOneHotVectorizer() { } - public ColumnTypeConverter(params string[] inputColumns) + public CategoricalOneHotVectorizer(params string[] inputColumns) { if (inputColumns != null) { @@ -9004,7 +9082,7 @@ public ColumnTypeConverter(params string[] inputColumns) } } - public ColumnTypeConverter(params (string inputColumn, string outputColumn)[] inputOutputColumns) + public CategoricalOneHotVectorizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) { if (inputOutputColumns != null) { @@ -9017,33 +9095,48 @@ public ColumnTypeConverter(params (string inputColumn, string outputColumn)[] 
in public void AddColumn(string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } /// - /// New column definition(s) (optional form: name:type:src) + /// New column definition(s) (optional form: name:src) /// - public ConvertTransformColumn[] Column { get; set; } + public CategoricalTransformColumn[] Column { get; set; } /// - /// The result type + /// Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index) /// - public Microsoft.ML.Data.DataKind? ResultType { get; set; } + public CategoricalTransformOutputKind OutputKind { get; set; } = CategoricalTransformOutputKind.Ind; /// - /// For a key column, this defines the range of values + /// Maximum number of terms to keep per column when auto-training /// - public string Range { get; set; } + public int MaxNumTerms { get; set; } = 1000000; + + /// + /// List of terms + /// + public string[] Term { get; set; } + + /// + /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). 
+ /// + public TermTransformSortOrder Sort { get; set; } = TermTransformSortOrder.Occurrence; + + /// + /// Whether key value metadata should be text, regardless of the actual input type + /// + public bool TextKeyValues { get; set; } = true; /// /// Input dataset @@ -9072,18 +9165,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(ColumnTypeConverter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(CategoricalOneHotVectorizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new ColumnTypeConverterPipelineStep(output); + return new CategoricalOneHotVectorizerPipelineStep(output); } - private class ColumnTypeConverterPipelineStep : ILearningPipelineDataStep + private class CategoricalOneHotVectorizerPipelineStep : ILearningPipelineDataStep { - public ColumnTypeConverterPipelineStep(Output output) + public CategoricalOneHotVectorizerPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -9098,33 +9191,87 @@ public ColumnTypeConverterPipelineStep(Output output) namespace Transforms { - /// - /// Groups values of a scalar column into a vector, by a contiguous group ID - /// - public sealed partial class CombinerByContiguousGroupId : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class CharTokenizeTransformColumn : OneToOneColumn, IOneToOneColumn { - - /// - /// Columns to group by + /// Name of the new column /// - public string[] GroupKey { get; set; } + public string Name { get; set; } /// - /// Columns to group together + /// Name of the source column /// - public string[] Column { get; set; } + public string Source { get; set; } - /// - /// Input dataset - 
/// - public Var Data { get; set; } = new Var(); + } + /// + /// Character-oriented tokenizer where text is considered a sequence of characters. + /// + public sealed partial class CharacterTokenizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + public CharacterTokenizer() { - /// - /// Transformed dataset + } + + public CharacterTokenizer(params string[] inputColumns) + { + if (inputColumns != null) + { + foreach (string input in inputColumns) + { + AddColumn(input); + } + } + } + + public CharacterTokenizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + if (inputOutputColumns != null) + { + foreach (var inputOutput in inputOutputColumns) + { + AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); + } + } + } + + public void AddColumn(string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); + Column = list.ToArray(); + } + + public void AddColumn(string outputColumn, string inputColumn) + { + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + Column = list.ToArray(); + } + + + /// + /// New column definition(s) (optional form: name:src) + /// + public CharTokenizeTransformColumn[] Column { get; set; } + + /// + /// Whether to mark the beginning/end of each row/slot with start of text character (0x02)/end of text character (0x03) + /// + public bool UseMarkerChars { get; set; } = true; + + /// + /// Input dataset + /// + public Var Data { get; set; } = new Var(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + { + /// + /// Transformed dataset /// public Var OutputData { get; set; } = new Var(); @@ -9142,18 +9289,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(CombinerByContiguousGroupId)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(CharacterTokenizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new CombinerByContiguousGroupIdPipelineStep(output); + return new CharacterTokenizerPipelineStep(output); } - private class CombinerByContiguousGroupIdPipelineStep : ILearningPipelineDataStep + private class CharacterTokenizerPipelineStep : ILearningPipelineDataStep { - public CombinerByContiguousGroupIdPipelineStep(Output output) + public CharacterTokenizerPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -9168,18 +9315,103 @@ public CombinerByContiguousGroupIdPipelineStep(Output output) namespace Transforms { - public sealed partial class NormalizeTransformAffineColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class ConcatTransformColumn : ManyToOneColumn, IManyToOneColumn { /// - /// Whether to map 
zero to zero, preserving sparsity + /// Name of the new column /// - public bool? FixZero { get; set; } + public string Name { get; set; } /// - /// Max number of examples used to train the normalizer + /// Name of the source column /// - public long? MaxTrainingExamples { get; set; } + public string[] Source { get; set; } + + } + + /// + /// Concatenates two columns of the same item type. + /// + public sealed partial class ColumnConcatenator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { + + public ColumnConcatenator() + { + } + + public ColumnConcatenator(string outputColumn, params string[] inputColumns) + { + AddColumn(outputColumn, inputColumns); + } + + public void AddColumn(string name, params string[] source) + { + var list = Column == null ? new List() : new List(Column); + list.Add(ManyToOneColumn.Create(name, source)); + Column = list.ToArray(); + } + + + /// + /// New column definition(s) (optional form: name:srcs) + /// + public ConcatTransformColumn[] Column { get; set; } + + /// + /// Input dataset + /// + public Var Data { get; set; } = new Var(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + { + /// + /// Transformed dataset + /// + public Var OutputData { get; set; } = new Var(); + + /// + /// Transform model + /// + public Var Model { get; set; } = new Var(); + + } + public Var GetInputData() => Data; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(ColumnConcatenator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + Data = dataStep.Data; + } + Output output = experiment.Add(this); + return new ColumnConcatenatorPipelineStep(output); + } + + private class ColumnConcatenatorPipelineStep : 
ILearningPipelineDataStep + { + public ColumnConcatenatorPipelineStep(Output output) + { + Data = output.OutputData; + Model = output.Model; + } + + public Var Data { get; } + public Var Model { get; } + } + } + } + + namespace Transforms + { + public sealed partial class CopyColumnsTransformColumn : OneToOneColumn, IOneToOneColumn + { /// /// Name of the new column /// @@ -9193,16 +9425,16 @@ public sealed partial class NormalizeTransformAffineColumn : OneToOneColumn - /// Normalize the columns only if needed + /// Duplicates columns from the dataset /// - public sealed partial class ConditionalNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class ColumnCopier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public ConditionalNormalizer() + public ColumnCopier() { } - public ConditionalNormalizer(params string[] inputColumns) + public ColumnCopier(params string[] inputColumns) { if (inputColumns != null) { @@ -9213,7 +9445,7 @@ public ConditionalNormalizer(params string[] inputColumns) } } - public ConditionalNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) + public ColumnCopier(params (string inputColumn, string outputColumn)[] inputOutputColumns) { if (inputOutputColumns != null) { @@ -9226,15 +9458,15 @@ public ConditionalNormalizer(params (string inputColumn, string outputColumn)[] public void AddColumn(string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } @@ -9242,17 +9474,7 @@ public void AddColumn(string outputColumn, string inputColumn) /// /// New column definition(s) (optional form: name:src) /// - public NormalizeTransformAffineColumn[] Column { get; set; } - - /// - /// Whether to map zero to zero, preserving sparsity - /// - public bool FixZero { get; set; } = true; - - /// - /// Max number of examples used to train the normalizer - /// - public long MaxTrainingExamples { get; set; } = 1000000000; + public CopyColumnsTransformColumn[] Column { get; set; } /// /// Input dataset @@ -9260,7 +9482,7 @@ public void AddColumn(string outputColumn, string inputColumn) public Var Data { get; set; } = new Var(); - public sealed class Output + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput { /// /// Transformed dataset @@ -9281,18 +9503,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(ConditionalNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(ColumnCopier)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new ConditionalNormalizerPipelineStep(output); + return new ColumnCopierPipelineStep(output); } - private class ConditionalNormalizerPipelineStep : ILearningPipelineDataStep + private class ColumnCopierPipelineStep : ILearningPipelineDataStep { - public ConditionalNormalizerPipelineStep(Output output) + public ColumnCopierPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -9306,24 +9528,18 @@ public ConditionalNormalizerPipelineStep(Output output) namespace Transforms { - public enum CacheCachingType - { 
- Memory = 0, - Disk = 1 - } - /// - /// Caches using the specified cache option. + /// Drops columns from the dataset /// - public sealed partial class DataCache : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class ColumnDropper : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { /// - /// Caching strategy + /// Column name to drop /// - public CacheCachingType Caching { get; set; } = CacheCachingType.Memory; + public string[] Column { get; set; } /// /// Input dataset @@ -9331,13 +9547,18 @@ public sealed partial class DataCache : Microsoft.ML.Runtime.EntryPoints.CommonI public Var Data { get; set; } = new Var(); - public sealed class Output + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput { /// - /// Dataset + /// Transformed dataset /// public Var OutputData { get; set; } = new Var(); + /// + /// Transform model + /// + public Var Model { get; set; } = new Var(); + } public Var GetInputData() => Data; @@ -9347,20 +9568,21 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(DataCache)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(ColumnDropper)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new DataCachePipelineStep(output); + return new ColumnDropperPipelineStep(output); } - private class DataCachePipelineStep : ILearningPipelineDataStep + private class ColumnDropperPipelineStep : ILearningPipelineDataStep { - public DataCachePipelineStep(Output output) + public ColumnDropperPipelineStep(Output output) { Data = output.OutputData; + Model = output.Model; } public Var Data { get; } @@ 
-9373,77 +9595,63 @@ namespace Transforms { /// - /// Score a dataset with a predictor model + /// Selects a set of columns, dropping all others /// - public sealed partial class DatasetScorer + public sealed partial class ColumnSelector : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { /// - /// The dataset to be scored - /// - public Var Data { get; set; } = new Var(); - - /// - /// The predictor model to apply to data + /// Column name to keep /// - public Var PredictorModel { get; set; } = new Var(); + public string[] Column { get; set; } /// - /// Suffix to append to the score columns + /// Input dataset /// - public string Suffix { get; set; } + public Var Data { get; set; } = new Var(); - public sealed class Output + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput { /// - /// The scored dataset + /// Transformed dataset /// - public Var ScoredData { get; set; } = new Var(); + public Var OutputData { get; set; } = new Var(); /// - /// The scoring transform + /// Transform model /// - public Var ScoringTransform { get; set; } = new Var(); + public Var Model { get; set; } = new Var(); } - } - } - - namespace Transforms - { - - /// - /// Score a dataset with a transform model - /// - public sealed partial class DatasetTransformScorer - { - - - /// - /// The dataset to be scored - /// - public Var Data { get; set; } = new Var(); - - /// - /// The transform model to apply to data - /// - public Var TransformModel { get; set; } = new Var(); + public Var GetInputData() => Data; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(ColumnSelector)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + Data = dataStep.Data; + } + Output output = 
experiment.Add(this); + return new ColumnSelectorPipelineStep(output); + } - public sealed class Output + private class ColumnSelectorPipelineStep : ILearningPipelineDataStep { - /// - /// The scored dataset - /// - public Var ScoredData { get; set; } = new Var(); - - /// - /// The scoring transform - /// - public Var ScoringTransform { get; set; } = new Var(); + public ColumnSelectorPipelineStep(Output output) + { + Data = output.OutputData; + Model = output.Model; + } + public Var Data { get; } + public Var Model { get; } } } } @@ -9451,27 +9659,17 @@ public sealed class Output namespace Transforms { - public sealed partial class TermTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class ConvertTransformColumn : OneToOneColumn, IOneToOneColumn { /// - /// Maximum number of terms to keep when auto-training - /// - public int? MaxNumTerms { get; set; } - - /// - /// List of terms - /// - public string[] Term { get; set; } - - /// - /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). + /// The result type /// - public TermTransformSortOrder? Sort { get; set; } + public Microsoft.ML.Data.DataKind? ResultType { get; set; } /// - /// Whether key value metadata should be text, regardless of the actual input type + /// For a key column, this defines the range of values /// - public bool? TextKeyValues { get; set; } + public string Range { get; set; } /// /// Name of the new column @@ -9486,16 +9684,16 @@ public sealed partial class TermTransformColumn : OneToOneColumn - /// Converts input values (words, numbers, etc.) to index in a dictionary. + /// Converts a column to a different type, using standard conversions. 
/// - public sealed partial class Dictionarizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class ColumnTypeConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public Dictionarizer() + public ColumnTypeConverter() { } - public Dictionarizer(params string[] inputColumns) + public ColumnTypeConverter(params string[] inputColumns) { if (inputColumns != null) { @@ -9506,7 +9704,7 @@ public Dictionarizer(params string[] inputColumns) } } - public Dictionarizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) + public ColumnTypeConverter(params (string inputColumn, string outputColumn)[] inputOutputColumns) { if (inputOutputColumns != null) { @@ -9519,43 +9717,33 @@ public Dictionarizer(params (string inputColumn, string outputColumn)[] inputOut public void AddColumn(string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } /// - /// New column definition(s) (optional form: name:src) - /// - public TermTransformColumn[] Column { get; set; } - - /// - /// Maximum number of terms to keep per column when auto-training - /// - public int MaxNumTerms { get; set; } = 1000000; - - /// - /// List of terms + /// New column definition(s) (optional form: name:type:src) /// - public string[] Term { get; set; } + public ConvertTransformColumn[] Column { get; set; } /// - /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). + /// The result type /// - public TermTransformSortOrder Sort { get; set; } = TermTransformSortOrder.Occurrence; + public Microsoft.ML.Data.DataKind? ResultType { get; set; } /// - /// Whether key value metadata should be text, regardless of the actual input type + /// For a key column, this defines the range of values /// - public bool TextKeyValues { get; set; } = false; + public string Range { get; set; } /// /// Input dataset @@ -9584,18 +9772,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(Dictionarizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(ColumnTypeConverter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new DictionarizerPipelineStep(output); + return new ColumnTypeConverterPipelineStep(output); } - private class DictionarizerPipelineStep : ILearningPipelineDataStep + private class ColumnTypeConverterPipelineStep : ILearningPipelineDataStep { 
- public DictionarizerPipelineStep(Output output) + public ColumnTypeConverterPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -9611,16 +9799,21 @@ namespace Transforms { /// - /// Combines all the features into one feature column. + /// Groups values of a scalar column into a vector, by a contiguous group ID /// - public sealed partial class FeatureCombiner : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class CombinerByContiguousGroupId : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { /// - /// Features + /// Columns to group by /// - public string[] Features { get; set; } + public string[] GroupKey { get; set; } + + /// + /// Columns to group together + /// + public string[] Column { get; set; } /// /// Input dataset @@ -9649,18 +9842,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(FeatureCombiner)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(CombinerByContiguousGroupId)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new FeatureCombinerPipelineStep(output); + return new CombinerByContiguousGroupIdPipelineStep(output); } - private class FeatureCombinerPipelineStep : ILearningPipelineDataStep + private class CombinerByContiguousGroupIdPipelineStep : ILearningPipelineDataStep { - public FeatureCombinerPipelineStep(Output output) + public CombinerByContiguousGroupIdPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -9675,22 +9868,91 @@ public FeatureCombinerPipelineStep(Output output) namespace Transforms { + public sealed partial class 
NormalizeTransformAffineColumn : OneToOneColumn, IOneToOneColumn + { + /// + /// Whether to map zero to zero, preserving sparsity + /// + public bool? FixZero { get; set; } + + /// + /// Max number of examples used to train the normalizer + /// + public long? MaxTrainingExamples { get; set; } + + /// + /// Name of the new column + /// + public string Name { get; set; } + + /// + /// Name of the source column + /// + public string Source { get; set; } + + } + /// - /// Selects the slots for which the count of non-default values is greater than or equal to a threshold. + /// Normalize the columns only if needed /// - public sealed partial class FeatureSelectorByCount : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class ConditionalNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { + public ConditionalNormalizer() + { + } + + public ConditionalNormalizer(params string[] inputColumns) + { + if (inputColumns != null) + { + foreach (string input in inputColumns) + { + AddColumn(input); + } + } + } + + public ConditionalNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + if (inputOutputColumns != null) + { + foreach (var inputOutput in inputOutputColumns) + { + AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); + } + } + } + + public void AddColumn(string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); + Column = list.ToArray(); + } + + public void AddColumn(string outputColumn, string inputColumn) + { + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + Column = list.ToArray(); + } + /// - /// Columns to use for feature selection + /// New column definition(s) (optional form: name:src) /// - public string[] Column { get; set; } + public NormalizeTransformAffineColumn[] Column { get; set; } /// - /// If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved + /// Whether to map zero to zero, preserving sparsity /// - public long Count { get; set; } = 1; + public bool FixZero { get; set; } = true; + + /// + /// Max number of examples used to train the normalizer + /// + public long MaxTrainingExamples { get; set; } = 1000000000; /// /// Input dataset @@ -9698,7 +9960,7 @@ public sealed partial class FeatureSelectorByCount : Microsoft.ML.Runtime.EntryP public Var Data { get; set; } = new Var(); - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + public sealed class Output { /// /// Transformed dataset @@ -9719,18 +9981,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(FeatureSelectorByCount)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(ConditionalNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new FeatureSelectorByCountPipelineStep(output); + return new ConditionalNormalizerPipelineStep(output); } - private class FeatureSelectorByCountPipelineStep : ILearningPipelineDataStep + private class ConditionalNormalizerPipelineStep : ILearningPipelineDataStep { - public FeatureSelectorByCountPipelineStep(Output output) + public ConditionalNormalizerPipelineStep(Output output) { Data = output.OutputData; Model 
= output.Model; @@ -9744,52 +10006,38 @@ public FeatureSelectorByCountPipelineStep(Output output) namespace Transforms { + public enum CacheCachingType + { + Memory = 0, + Disk = 1 + } + /// - /// Selects the top k slots across all specified columns ordered by their mutual information with the label column. + /// Caches using the specified cache option. /// - public sealed partial class FeatureSelectorByMutualInformation : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class DataCache : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { /// - /// Columns to use for feature selection + /// Caching strategy /// - public string[] Column { get; set; } + public CacheCachingType Caching { get; set; } = CacheCachingType.Memory; /// - /// Column to use for labels - /// - public string LabelColumn { get; set; } = "Label"; - - /// - /// The maximum number of slots to preserve in output - /// - public int SlotsInOutput { get; set; } = 1000; - - /// - /// Max number of bins for R4/R8 columns, power of 2 recommended - /// - public int NumBins { get; set; } = 256; - - /// - /// Input dataset + /// Input dataset /// public Var Data { get; set; } = new Var(); - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + public sealed class Output { /// - /// Transformed dataset + /// Dataset /// public Var OutputData { get; set; } = new Var(); - /// - /// Transform model - /// - public Var Model { get; set; } = new Var(); - } public Var GetInputData() => Data; @@ -9799,21 +10047,20 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(FeatureSelectorByMutualInformation)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ 
nameof(DataCache)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new FeatureSelectorByMutualInformationPipelineStep(output); + return new DataCachePipelineStep(output); } - private class FeatureSelectorByMutualInformationPipelineStep : ILearningPipelineDataStep + private class DataCachePipelineStep : ILearningPipelineDataStep { - public FeatureSelectorByMutualInformationPipelineStep(Output output) + public DataCachePipelineStep(Output output) { Data = output.OutputData; - Model = output.Model; } public Var Data { get; } @@ -9825,148 +10072,78 @@ public FeatureSelectorByMutualInformationPipelineStep(Output output) namespace Transforms { - public sealed partial class LpNormNormalizerTransformGcnColumn : OneToOneColumn, IOneToOneColumn + /// + /// Score a dataset with a predictor model + /// + public sealed partial class DatasetScorer { - /// - /// Normalize by standard deviation rather than L2 norm - /// - public bool? UseStdDev { get; set; } - /// - /// Scale features by this value - /// - public float? Scale { get; set; } /// - /// Subtract mean from each value before normalizing + /// The dataset to be scored /// - public bool? SubMean { get; set; } + public Var Data { get; set; } = new Var(); /// - /// Name of the new column + /// The predictor model to apply to data /// - public string Name { get; set; } + public Var PredictorModel { get; set; } = new Var(); /// - /// Name of the source column + /// Suffix to append to the score columns /// - public string Source { get; set; } - - } + public string Suffix { get; set; } - /// - /// Performs a global contrast normalization on input values: Y = (s * X - M) / D, where s is a scale, M is mean and D is either L2 norm or standard deviation. 
- /// - public sealed partial class GlobalContrastNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem - { - public GlobalContrastNormalizer() - { - } - - public GlobalContrastNormalizer(params string[] inputColumns) - { - if (inputColumns != null) - { - foreach (string input in inputColumns) - { - AddColumn(input); - } - } - } - - public GlobalContrastNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - if (inputOutputColumns != null) - { - foreach (var inputOutput in inputOutputColumns) - { - AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); - } - } - } - - public void AddColumn(string inputColumn) + public sealed class Output { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); - Column = list.ToArray(); - } + /// + /// The scored dataset + /// + public Var ScoredData { get; set; } = new Var(); - public void AddColumn(string outputColumn, string inputColumn) - { - var list = Column == null ? 
new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); - Column = list.ToArray(); - } + /// + /// The scoring transform + /// + public Var ScoringTransform { get; set; } = new Var(); + } + } + } - /// - /// New column definition(s) (optional form: name:src) - /// - public LpNormNormalizerTransformGcnColumn[] Column { get; set; } + namespace Transforms + { - /// - /// Subtract mean from each value before normalizing - /// - public bool SubMean { get; set; } = true; + /// + /// Score a dataset with a transform model + /// + public sealed partial class DatasetTransformScorer + { - /// - /// Normalize by standard deviation rather than L2 norm - /// - public bool UseStdDev { get; set; } = false; /// - /// Scale features by this value + /// The dataset to be scored /// - public float Scale { get; set; } = 1f; + public Var Data { get; set; } = new Var(); /// - /// Input dataset + /// The transform model to apply to data /// - public Var Data { get; set; } = new Var(); + public Var TransformModel { get; set; } = new Var(); - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + public sealed class Output { /// - /// Transformed dataset + /// The scored dataset /// - public Var OutputData { get; set; } = new Var(); + public Var ScoredData { get; set; } = new Var(); /// - /// Transform model + /// The scoring transform /// - public Var Model { get; set; } = new Var(); - - } - public Var GetInputData() => Data; - - public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) - { - if (previousStep != null) - { - if (!(previousStep is ILearningPipelineDataStep dataStep)) - { - throw new InvalidOperationException($"{ nameof(GlobalContrastNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); - } - - Data = dataStep.Data; - } - Output output = experiment.Add(this); - return new GlobalContrastNormalizerPipelineStep(output); - } - - 
private class GlobalContrastNormalizerPipelineStep : ILearningPipelineDataStep - { - public GlobalContrastNormalizerPipelineStep(Output output) - { - Data = output.OutputData; - Model = output.Model; - } + public Var ScoringTransform { get; set; } = new Var(); - public Var Data { get; } - public Var Model { get; } } } } @@ -9974,32 +10151,27 @@ public GlobalContrastNormalizerPipelineStep(Output output) namespace Transforms { - public sealed partial class HashJoinTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class TermTransformColumn : OneToOneColumn, IOneToOneColumn { /// - /// Whether the values need to be combined for a single hash - /// - public bool? Join { get; set; } - - /// - /// Which slots should be combined together. Example: 0,3,5;0,1;3;2,1,0. Overrides 'join'. + /// Maximum number of terms to keep when auto-training /// - public string CustomSlotMap { get; set; } + public int? MaxNumTerms { get; set; } /// - /// Number of bits to hash into. Must be between 1 and 31, inclusive. + /// List of terms /// - public int? HashBits { get; set; } + public string[] Term { get; set; } /// - /// Hashing seed + /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). /// - public uint? Seed { get; set; } + public TermTransformSortOrder? Sort { get; set; } /// - /// Whether the position of each term should be included in the hash + /// Whether key value metadata should be text, regardless of the actual input type /// - public bool? Ordered { get; set; } + public bool? TextKeyValues { get; set; } /// /// Name of the new column @@ -10014,16 +10186,16 @@ public sealed partial class HashJoinTransformColumn : OneToOneColumn - /// Converts column values into hashes. This transform accepts both numeric and text inputs, both single and vector-valued columns. 
This is a part of the Dracula transform. + /// Converts input values (words, numbers, etc.) to index in a dictionary. /// - public sealed partial class HashConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class Dictionarizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public HashConverter() + public Dictionarizer() { } - public HashConverter(params string[] inputColumns) + public Dictionarizer(params string[] inputColumns) { if (inputColumns != null) { @@ -10034,7 +10206,7 @@ public HashConverter(params string[] inputColumns) } } - public HashConverter(params (string inputColumn, string outputColumn)[] inputOutputColumns) + public Dictionarizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) { if (inputOutputColumns != null) { @@ -10047,15 +10219,15 @@ public HashConverter(params (string inputColumn, string outputColumn)[] inputOut public void AddColumn(string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } @@ -10063,27 +10235,27 @@ public void AddColumn(string outputColumn, string inputColumn) /// /// New column definition(s) (optional form: name:src) /// - public HashJoinTransformColumn[] Column { get; set; } + public TermTransformColumn[] Column { get; set; } /// - /// Whether the values need to be combined for a single hash + /// Maximum number of terms to keep per column when auto-training /// - public bool Join { get; set; } = true; + public int MaxNumTerms { get; set; } = 1000000; /// - /// Number of bits to hash into. Must be between 1 and 31, inclusive. + /// List of terms /// - public int HashBits { get; set; } = 31; + public string[] Term { get; set; } /// - /// Hashing seed + /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). 
/// - public uint Seed { get; set; } = 314489979; + public TermTransformSortOrder Sort { get; set; } = TermTransformSortOrder.Occurrence; /// - /// Whether the position of each term should be included in the hash + /// Whether key value metadata should be text, regardless of the actual input type /// - public bool Ordered { get; set; } = true; + public bool TextKeyValues { get; set; } = false; /// /// Input dataset @@ -10112,18 +10284,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(HashConverter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(Dictionarizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new HashConverterPipelineStep(output); + return new DictionarizerPipelineStep(output); } - private class HashConverterPipelineStep : ILearningPipelineDataStep + private class DictionarizerPipelineStep : ILearningPipelineDataStep { - public HashConverterPipelineStep(Output output) + public DictionarizerPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -10138,71 +10310,87 @@ public HashConverterPipelineStep(Output output) namespace Transforms { - public sealed partial class KeyToValueTransformColumn : OneToOneColumn, IOneToOneColumn + /// + /// Combines all the features into one feature column. 
+ /// + public sealed partial class FeatureCombiner : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - /// - /// Name of the new column - /// - public string Name { get; set; } + /// - /// Name of the source column + /// Features /// - public string Source { get; set; } + public string[] Features { get; set; } - } + /// + /// Input dataset + /// + public Var Data { get; set; } = new Var(); - /// - /// KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the KeyValues metadata. - /// - public sealed partial class KeyToTextConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem - { - public KeyToTextConverter() + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput { + /// + /// Transformed dataset + /// + public Var OutputData { get; set; } = new Var(); + + /// + /// Transform model + /// + public Var Model { get; set; } = new Var(); + } + public Var GetInputData() => Data; - public KeyToTextConverter(params string[] inputColumns) + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) { - if (inputColumns != null) + if (previousStep != null) { - foreach (string input in inputColumns) + if (!(previousStep is ILearningPipelineDataStep dataStep)) { - AddColumn(input); + throw new InvalidOperationException($"{ nameof(FeatureCombiner)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } + + Data = dataStep.Data; } + Output output = experiment.Add(this); + return new FeatureCombinerPipelineStep(output); } - - public KeyToTextConverter(params (string inputColumn, string outputColumn)[] inputOutputColumns) + + private class FeatureCombinerPipelineStep : ILearningPipelineDataStep { - if (inputOutputColumns != null) + public FeatureCombinerPipelineStep(Output output) { - foreach (var inputOutput in inputOutputColumns) - { - 
AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); - } + Data = output.OutputData; + Model = output.Model; } - } - - public void AddColumn(string inputColumn) - { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); - Column = list.ToArray(); - } - public void AddColumn(string outputColumn, string inputColumn) - { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); - Column = list.ToArray(); + public Var Data { get; } + public Var Model { get; } } + } + } + + namespace Transforms + { + + /// + /// Selects the slots for which the count of non-default values is greater than or equal to a threshold. + /// + public sealed partial class FeatureSelectorByCount : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { /// - /// New column definition(s) (optional form: name:src) + /// Columns to use for feature selection /// - public KeyToValueTransformColumn[] Column { get; set; } + public string[] Column { get; set; } + + /// + /// If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved + /// + public long Count { get; set; } = 1; /// /// Input dataset @@ -10231,18 +10419,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(KeyToTextConverter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(FeatureSelectorByCount)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new KeyToTextConverterPipelineStep(output); + return new FeatureSelectorByCountPipelineStep(output); } - private class KeyToTextConverterPipelineStep : 
ILearningPipelineDataStep + private class FeatureSelectorByCountPipelineStep : ILearningPipelineDataStep { - public KeyToTextConverterPipelineStep(Output output) + public FeatureSelectorByCountPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -10258,21 +10446,31 @@ namespace Transforms { /// - /// Transforms the label to either key or bool (if needed) to make it suitable for classification. + /// Selects the top k slots across all specified columns ordered by their mutual information with the label column. /// - public sealed partial class LabelColumnKeyBooleanConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class FeatureSelectorByMutualInformation : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { /// - /// Convert the key values to text + /// Columns to use for feature selection /// - public bool TextKeyValues { get; set; } = true; + public string[] Column { get; set; } /// - /// The label column + /// Column to use for labels /// - public string LabelColumn { get; set; } + public string LabelColumn { get; set; } = "Label"; + + /// + /// The maximum number of slots to preserve in output + /// + public int SlotsInOutput { get; set; } = 1000; + + /// + /// Max number of bins for R4/R8 columns, power of 2 recommended + /// + public int NumBins { get; set; } = 256; /// /// Input dataset @@ -10301,18 +10499,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(LabelColumnKeyBooleanConverter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(FeatureSelectorByMutualInformation)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = 
experiment.Add(this); - return new LabelColumnKeyBooleanConverterPipelineStep(output); + return new FeatureSelectorByMutualInformationPipelineStep(output); } - private class LabelColumnKeyBooleanConverterPipelineStep : ILearningPipelineDataStep + private class FeatureSelectorByMutualInformationPipelineStep : ILearningPipelineDataStep { - public LabelColumnKeyBooleanConverterPipelineStep(Output output) + public FeatureSelectorByMutualInformationPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -10327,12 +10525,22 @@ public LabelColumnKeyBooleanConverterPipelineStep(Output output) namespace Transforms { - public sealed partial class LabelIndicatorTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class LpNormNormalizerTransformGcnColumn : OneToOneColumn, IOneToOneColumn { /// - /// The positive example class for binary classification. + /// Normalize by standard deviation rather than L2 norm /// - public int? ClassIndex { get; set; } + public bool? UseStdDev { get; set; } + + /// + /// Scale features by this value + /// + public float? Scale { get; set; } + + /// + /// Subtract mean from each value before normalizing + /// + public bool? 
SubMean { get; set; } /// /// Name of the new column @@ -10347,16 +10555,16 @@ public sealed partial class LabelIndicatorTransformColumn : OneToOneColumn - public sealed partial class LabelIndicator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class GlobalContrastNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public LabelIndicator() + public GlobalContrastNormalizer() { } - public LabelIndicator(params string[] inputColumns) + public GlobalContrastNormalizer(params string[] inputColumns) { if (inputColumns != null) { @@ -10367,7 +10575,7 @@ public LabelIndicator(params string[] inputColumns) } } - public LabelIndicator(params (string inputColumn, string outputColumn)[] inputOutputColumns) + public GlobalContrastNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) { if (inputOutputColumns != null) { @@ -10380,15 +10588,15 @@ public LabelIndicator(params (string inputColumn, string outputColumn)[] inputOu public void AddColumn(string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } @@ -10396,12 +10604,22 @@ public void AddColumn(string outputColumn, string inputColumn) /// /// New column definition(s) (optional form: name:src) /// - public LabelIndicatorTransformColumn[] Column { get; set; } + public LpNormNormalizerTransformGcnColumn[] Column { get; set; } /// - /// Label of the positive class. + /// Subtract mean from each value before normalizing /// - public int ClassIndex { get; set; } + public bool SubMean { get; set; } = true; + + /// + /// Normalize by standard deviation rather than L2 norm + /// + public bool UseStdDev { get; set; } = false; + + /// + /// Scale features by this value + /// + public float Scale { get; set; } = 1f; /// /// Input dataset @@ -10430,18 +10648,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(LabelIndicator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(GlobalContrastNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new LabelIndicatorPipelineStep(output); + return new GlobalContrastNormalizerPipelineStep(output); } - private class LabelIndicatorPipelineStep : ILearningPipelineDataStep + private class GlobalContrastNormalizerPipelineStep : ILearningPipelineDataStep { - public LabelIndicatorPipelineStep(Output output) + public GlobalContrastNormalizerPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -10456,164 +10674,69 @@ public LabelIndicatorPipelineStep(Output output) namespace Transforms { - /// - /// Transforms the label to float to make it suitable for regression. 
- /// - public sealed partial class LabelToFloatConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class HashJoinTransformColumn : OneToOneColumn, IOneToOneColumn { + /// + /// Whether the values need to be combined for a single hash + /// + public bool? Join { get; set; } + + /// + /// Which slots should be combined together. Example: 0,3,5;0,1;3;2,1,0. Overrides 'join'. + /// + public string CustomSlotMap { get; set; } + /// + /// Number of bits to hash into. Must be between 1 and 31, inclusive. + /// + public int? HashBits { get; set; } /// - /// The label column + /// Hashing seed /// - public string LabelColumn { get; set; } + public uint? Seed { get; set; } /// - /// Input dataset + /// Whether the position of each term should be included in the hash /// - public Var Data { get; set; } = new Var(); + public bool? Ordered { get; set; } + /// + /// Name of the new column + /// + public string Name { get; set; } - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput - { - /// - /// Transformed dataset - /// - public Var OutputData { get; set; } = new Var(); + /// + /// Name of the source column + /// + public string Source { get; set; } - /// - /// Transform model - /// - public Var Model { get; set; } = new Var(); + } + + /// + /// Converts column values into hashes. This transform accepts both numeric and text inputs, both single and vector-valued columns. This is a part of the Dracula transform. 
+ /// + public sealed partial class HashConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { + public HashConverter() + { } - public Var GetInputData() => Data; - public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + public HashConverter(params string[] inputColumns) { - if (previousStep != null) + if (inputColumns != null) { - if (!(previousStep is ILearningPipelineDataStep dataStep)) + foreach (string input in inputColumns) { - throw new InvalidOperationException($"{ nameof(LabelToFloatConverter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + AddColumn(input); } - - Data = dataStep.Data; } - Output output = experiment.Add(this); - return new LabelToFloatConverterPipelineStep(output); } - - private class LabelToFloatConverterPipelineStep : ILearningPipelineDataStep + + public HashConverter(params (string inputColumn, string outputColumn)[] inputOutputColumns) { - public LabelToFloatConverterPipelineStep(Output output) - { - Data = output.OutputData; - Model = output.Model; - } - - public Var Data { get; } - public Var Model { get; } - } - } - } - - namespace Transforms - { - - public sealed partial class LdaTransformColumn : OneToOneColumn, IOneToOneColumn - { - /// - /// The number of topics in the LDA - /// - public int? NumTopic { get; set; } - - /// - /// Dirichlet prior on document-topic vectors - /// - public float? AlphaSum { get; set; } - - /// - /// Dirichlet prior on vocab-topic vectors - /// - public float? Beta { get; set; } - - /// - /// Number of Metropolis Hasting step - /// - public int? Mhstep { get; set; } - - /// - /// Number of iterations - /// - public int? NumIterations { get; set; } - - /// - /// Compute log likelihood over local dataset on this iteration interval - /// - public int? LikelihoodInterval { get; set; } - - /// - /// The number of training threads - /// - public int? 
NumThreads { get; set; } - - /// - /// The threshold of maximum count of tokens per doc - /// - public int? NumMaxDocToken { get; set; } - - /// - /// The number of words to summarize the topic - /// - public int? NumSummaryTermPerTopic { get; set; } - - /// - /// The number of burn-in iterations - /// - public int? NumBurninIterations { get; set; } = 10; - - /// - /// Reset the random number generator for each document - /// - public bool? ResetRandomGenerator { get; set; } - - /// - /// Name of the new column - /// - public string Name { get; set; } - - /// - /// Name of the source column - /// - public string Source { get; set; } - - } - - /// - /// The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation. - /// - public sealed partial class LightLda : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem - { - - public LightLda() - { - } - - public LightLda(params string[] inputColumns) - { - if (inputColumns != null) - { - foreach (string input in inputColumns) - { - AddColumn(input); - } - } - } - - public LightLda(params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - if (inputOutputColumns != null) + if (inputOutputColumns != null) { foreach (var inputOutput in inputOutputColumns) { @@ -10624,89 +10747,43 @@ public LightLda(params (string inputColumn, string outputColumn)[] inputOutputCo public void AddColumn(string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } /// - /// New column definition(s) (optional form: name:srcs) - /// - public LdaTransformColumn[] Column { get; set; } - - /// - /// The number of topics in the LDA - /// - [TlcModule.SweepableDiscreteParamAttribute("NumTopic", new object[]{20, 40, 100, 200})] - public int NumTopic { get; set; } = 100; - - /// - /// Dirichlet prior on document-topic vectors - /// - [TlcModule.SweepableDiscreteParamAttribute("AlphaSum", new object[]{1, 10, 100, 200})] - public float AlphaSum { get; set; } = 100f; - - /// - /// Dirichlet prior on vocab-topic vectors - /// - [TlcModule.SweepableDiscreteParamAttribute("Beta", new object[]{0.01f, 0.015f, 0.07f, 0.02f})] - public float Beta { get; set; } = 0.01f; - - /// - /// Number of Metropolis Hasting step - /// - [TlcModule.SweepableDiscreteParamAttribute("Mhstep", new object[]{2, 4, 8, 16})] - public int Mhstep { get; set; } = 4; - - /// - /// Number of iterations - /// - [TlcModule.SweepableDiscreteParamAttribute("NumIterations", new object[]{100, 200, 300, 400})] - public int NumIterations { get; set; } = 200; - - /// - /// Compute log likelihood over local dataset on this iteration interval - /// - public int LikelihoodInterval { get; set; } = 5; - - /// - /// The threshold of maximum count of tokens per doc - /// - public int NumMaxDocToken { get; set; } = 512; - - /// - /// The number of training threads. Default value depends on number of logical processors. + /// New column definition(s) (optional form: name:src) /// - public int? NumThreads { get; set; } + public HashJoinTransformColumn[] Column { get; set; } /// - /// The number of words to summarize the topic + /// Whether the values need to be combined for a single hash /// - public int NumSummaryTermPerTopic { get; set; } = 10; + public bool Join { get; set; } = true; /// - /// The number of burn-in iterations + /// Number of bits to hash into. 
Must be between 1 and 31, inclusive. /// - [TlcModule.SweepableDiscreteParamAttribute("NumBurninIterations", new object[]{10, 20, 30, 40})] - public int NumBurninIterations { get; set; } = 10; + public int HashBits { get; set; } = 31; /// - /// Reset the random number generator for each document + /// Hashing seed /// - public bool ResetRandomGenerator { get; set; } = false; + public uint Seed { get; set; } = 314489979; /// - /// Whether to output the topic-word summary in text format + /// Whether the position of each term should be included in the hash /// - public bool OutputTopicWordSummary { get; set; } = false; + public bool Ordered { get; set; } = true; /// /// Input dataset @@ -10735,18 +10812,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(LightLda)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(HashConverter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new LightLdaPipelineStep(output); + return new HashConverterPipelineStep(output); } - private class LightLdaPipelineStep : ILearningPipelineDataStep + private class HashConverterPipelineStep : ILearningPipelineDataStep { - public LightLdaPipelineStep(Output output) + public HashConverterPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -10761,13 +10838,8 @@ public LightLdaPipelineStep(Output output) namespace Transforms { - public sealed partial class NormalizeTransformLogNormalColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class KeyToValueTransformColumn : OneToOneColumn, IOneToOneColumn { - /// - /// Max number of examples used to train the normalizer - /// - public long? 
MaxTrainingExamples { get; set; } - /// /// Name of the new column /// @@ -10781,16 +10853,16 @@ public sealed partial class NormalizeTransformLogNormalColumn : OneToOneColumn - /// Normalizes the data based on the computed mean and variance of the logarithm of the data. + /// KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the KeyValues metadata. /// - public sealed partial class LogMeanVarianceNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class KeyToTextConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public LogMeanVarianceNormalizer() + public KeyToTextConverter() { } - public LogMeanVarianceNormalizer(params string[] inputColumns) + public KeyToTextConverter(params string[] inputColumns) { if (inputColumns != null) { @@ -10801,7 +10873,7 @@ public LogMeanVarianceNormalizer(params string[] inputColumns) } } - public LogMeanVarianceNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) + public KeyToTextConverter(params (string inputColumn, string outputColumn)[] inputOutputColumns) { if (inputOutputColumns != null) { @@ -10814,33 +10886,23 @@ public LogMeanVarianceNormalizer(params (string inputColumn, string outputColumn public void AddColumn(string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } - /// - /// Whether to use CDF as the output - /// - public bool UseCdf { get; set; } = true; - /// /// New column definition(s) (optional form: name:src) /// - public NormalizeTransformLogNormalColumn[] Column { get; set; } - - /// - /// Max number of examples used to train the normalizer - /// - public long MaxTrainingExamples { get; set; } = 1000000000; + public KeyToValueTransformColumn[] Column { get; set; } /// /// Input dataset @@ -10869,18 +10931,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(LogMeanVarianceNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(KeyToTextConverter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new LogMeanVarianceNormalizerPipelineStep(output); + return new KeyToTextConverterPipelineStep(output); } - private class LogMeanVarianceNormalizerPipelineStep : ILearningPipelineDataStep + private class KeyToTextConverterPipelineStep : ILearningPipelineDataStep { - public LogMeanVarianceNormalizerPipelineStep(Output output) + public KeyToTextConverterPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -10894,103 +10956,26 @@ public LogMeanVarianceNormalizerPipelineStep(Output output) namespace Transforms { - public enum LpNormNormalizerTransformNormalizerKind : byte + + /// + /// Transforms the label to either key or bool (if needed) to make it suitable for classification. 
+ /// + public sealed partial class LabelColumnKeyBooleanConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - L2Norm = 0, - StdDev = 1, - L1Norm = 2, - LInf = 3 - } - public sealed partial class LpNormNormalizerTransformColumn : OneToOneColumn, IOneToOneColumn - { /// - /// The norm to use to normalize each sample + /// Convert the key values to text /// - public LpNormNormalizerTransformNormalizerKind? NormKind { get; set; } + public bool TextKeyValues { get; set; } = true; /// - /// Subtract mean from each value before normalizing + /// The label column /// - public bool? SubMean { get; set; } + public string LabelColumn { get; set; } /// - /// Name of the new column - /// - public string Name { get; set; } - - /// - /// Name of the source column - /// - public string Source { get; set; } - - } - - /// - /// Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D is either L2 norm, L1 norm or LInf norm. - /// - public sealed partial class LpNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem - { - - public LpNormalizer() - { - } - - public LpNormalizer(params string[] inputColumns) - { - if (inputColumns != null) - { - foreach (string input in inputColumns) - { - AddColumn(input); - } - } - } - - public LpNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - if (inputOutputColumns != null) - { - foreach (var inputOutput in inputOutputColumns) - { - AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); - } - } - } - - public void AddColumn(string inputColumn) - { - var list = Column == null ? 
new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); - Column = list.ToArray(); - } - - public void AddColumn(string outputColumn, string inputColumn) - { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); - Column = list.ToArray(); - } - - - /// - /// New column definition(s) (optional form: name:src) - /// - public LpNormNormalizerTransformColumn[] Column { get; set; } - - /// - /// The norm to use to normalize each sample - /// - public LpNormNormalizerTransformNormalizerKind NormKind { get; set; } = LpNormNormalizerTransformNormalizerKind.L2Norm; - - /// - /// Subtract mean from each value before normalizing - /// - public bool SubMean { get; set; } = false; - - /// - /// Input dataset + /// Input dataset /// public Var Data { get; set; } = new Var(); @@ -11016,18 +11001,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(LpNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(LabelColumnKeyBooleanConverter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new LpNormalizerPipelineStep(output); + return new LabelColumnKeyBooleanConverterPipelineStep(output); } - private class LpNormalizerPipelineStep : ILearningPipelineDataStep + private class LabelColumnKeyBooleanConverterPipelineStep : ILearningPipelineDataStep { - public LpNormalizerPipelineStep(Output output) + public LabelColumnKeyBooleanConverterPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -11042,49 +11027,36 @@ public LpNormalizerPipelineStep(Output output) namespace Transforms { - /// - /// Combines a sequence of TransformModels and a PredictorModel into a 
single PredictorModel. - /// - public sealed partial class ManyHeterogeneousModelCombiner + public sealed partial class LabelIndicatorTransformColumn : OneToOneColumn, IOneToOneColumn { - - /// - /// Transform model + /// The positive example class for binary classification. /// - public ArrayVar TransformModels { get; set; } = new ArrayVar(); + public int? ClassIndex { get; set; } /// - /// Predictor model + /// Name of the new column /// - public Var PredictorModel { get; set; } = new Var(); - + public string Name { get; set; } - public sealed class Output - { - /// - /// Predictor model - /// - public Var PredictorModel { get; set; } = new Var(); + /// + /// Name of the source column + /// + public string Source { get; set; } - } } - } - - namespace Transforms - { /// - /// Normalizes the data based on the computed mean and variance of the data. + /// Label remapper used by OVA /// - public sealed partial class MeanVarianceNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class LabelIndicator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public MeanVarianceNormalizer() + public LabelIndicator() { } - public MeanVarianceNormalizer(params string[] inputColumns) + public LabelIndicator(params string[] inputColumns) { if (inputColumns != null) { @@ -11095,7 +11067,7 @@ public MeanVarianceNormalizer(params string[] inputColumns) } } - public MeanVarianceNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) + public LabelIndicator(params (string inputColumn, string outputColumn)[] inputOutputColumns) { if (inputOutputColumns != null) { @@ -11108,38 +11080,28 @@ public MeanVarianceNormalizer(params (string inputColumn, string outputColumn)[] public void AddColumn(string inputColumn) { - var list = Column == null ? 
new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } - /// - /// Whether to use CDF as the output - /// - public bool UseCdf { get; set; } = false; - /// /// New column definition(s) (optional form: name:src) /// - public NormalizeTransformAffineColumn[] Column { get; set; } - - /// - /// Whether to map zero to zero, preserving sparsity - /// - public bool FixZero { get; set; } = true; + public LabelIndicatorTransformColumn[] Column { get; set; } /// - /// Max number of examples used to train the normalizer + /// Label of the positive class. 
/// - public long MaxTrainingExamples { get; set; } = 1000000000; + public int ClassIndex { get; set; } /// /// Input dataset @@ -11168,18 +11130,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(MeanVarianceNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(LabelIndicator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new MeanVarianceNormalizerPipelineStep(output); + return new LabelIndicatorPipelineStep(output); } - private class MeanVarianceNormalizerPipelineStep : ILearningPipelineDataStep + private class LabelIndicatorPipelineStep : ILearningPipelineDataStep { - public MeanVarianceNormalizerPipelineStep(Output output) + public LabelIndicatorPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -11195,66 +11157,16 @@ namespace Transforms { /// - /// Normalizes the data based on the observed minimum and maximum values of the data. + /// Transforms the label to float to make it suitable for regression. 
/// - public sealed partial class MinMaxNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class LabelToFloatConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public MinMaxNormalizer() - { - } - - public MinMaxNormalizer(params string[] inputColumns) - { - if (inputColumns != null) - { - foreach (string input in inputColumns) - { - AddColumn(input); - } - } - } - - public MinMaxNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - if (inputOutputColumns != null) - { - foreach (var inputOutput in inputOutputColumns) - { - AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); - } - } - } - - public void AddColumn(string inputColumn) - { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); - Column = list.ToArray(); - } - - public void AddColumn(string outputColumn, string inputColumn) - { - var list = Column == null ? 
new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); - Column = list.ToArray(); - } - - - /// - /// New column definition(s) (optional form: name:src) - /// - public NormalizeTransformAffineColumn[] Column { get; set; } - - /// - /// Whether to map zero to zero, preserving sparsity - /// - public bool FixZero { get; set; } = true; /// - /// Max number of examples used to train the normalizer + /// The label column /// - public long MaxTrainingExamples { get; set; } = 1000000000; + public string LabelColumn { get; set; } /// /// Input dataset @@ -11283,18 +11195,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(MinMaxNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(LabelToFloatConverter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new MinMaxNormalizerPipelineStep(output); + return new LabelToFloatConverterPipelineStep(output); } - private class MinMaxNormalizerPipelineStep : ILearningPipelineDataStep + private class LabelToFloatConverterPipelineStep : ILearningPipelineDataStep { - public MinMaxNormalizerPipelineStep(Output output) + public LabelToFloatConverterPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -11308,55 +11220,87 @@ public MinMaxNormalizerPipelineStep(Output output) namespace Transforms { - public enum NAHandleTransformReplacementKind - { - DefaultValue = 0, - Mean = 1, - Minimum = 2, - Maximum = 3 - } - - public sealed partial class NAHandleTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class LdaTransformColumn : OneToOneColumn, IOneToOneColumn { /// - /// The replacement method to utilize + /// The number of topics in the LDA 
/// - public NAHandleTransformReplacementKind? Kind { get; set; } + public int? NumTopic { get; set; } /// - /// Whether to impute values by slot + /// Dirichlet prior on document-topic vectors /// - public bool? ImputeBySlot { get; set; } + public float? AlphaSum { get; set; } /// - /// Whether or not to concatenate an indicator vector column to the value column + /// Dirichlet prior on vocab-topic vectors /// - public bool? ConcatIndicator { get; set; } + public float? Beta { get; set; } /// - /// Name of the new column + /// Number of Metropolis Hasting step /// - public string Name { get; set; } + public int? Mhstep { get; set; } /// - /// Name of the source column + /// Number of iterations /// - public string Source { get; set; } + public int? NumIterations { get; set; } - } + /// + /// Compute log likelihood over local dataset on this iteration interval + /// + public int? LikelihoodInterval { get; set; } - /// - /// Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). An indicator column can optionally be concatenated, if theinput column type is numeric. + /// + /// The number of training threads + /// + public int? NumThreads { get; set; } + + /// + /// The threshold of maximum count of tokens per doc + /// + public int? NumMaxDocToken { get; set; } + + /// + /// The number of words to summarize the topic + /// + public int? NumSummaryTermPerTopic { get; set; } + + /// + /// The number of burn-in iterations + /// + public int? NumBurninIterations { get; set; } = 10; + + /// + /// Reset the random number generator for each document + /// + public bool? ResetRandomGenerator { get; set; } + + /// + /// Name of the new column + /// + public string Name { get; set; } + + /// + /// Name of the source column + /// + public string Source { get; set; } + + } + + /// + /// The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation. 
/// - public sealed partial class MissingValueHandler : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class LightLda : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public MissingValueHandler() + public LightLda() { } - public MissingValueHandler(params string[] inputColumns) + public LightLda(params string[] inputColumns) { if (inputColumns != null) { @@ -11367,7 +11311,7 @@ public MissingValueHandler(params string[] inputColumns) } } - public MissingValueHandler(params (string inputColumn, string outputColumn)[] inputOutputColumns) + public LightLda(params (string inputColumn, string outputColumn)[] inputOutputColumns) { if (inputOutputColumns != null) { @@ -11380,38 +11324,89 @@ public MissingValueHandler(params (string inputColumn, string outputColumn)[] in public void AddColumn(string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } /// - /// New column definition(s) (optional form: name:rep:src) + /// New column definition(s) (optional form: name:srcs) /// - public NAHandleTransformColumn[] Column { get; set; } + public LdaTransformColumn[] Column { get; set; } /// - /// The replacement method to utilize + /// The number of topics in the LDA /// - public NAHandleTransformReplacementKind ReplaceWith { get; set; } = NAHandleTransformReplacementKind.DefaultValue; + [TlcModule.SweepableDiscreteParamAttribute("NumTopic", new object[]{20, 40, 100, 200})] + public int NumTopic { get; set; } = 100; /// - /// Whether to impute values by slot + /// Dirichlet prior on document-topic vectors /// - public bool ImputeBySlot { get; set; } = true; + [TlcModule.SweepableDiscreteParamAttribute("AlphaSum", new object[]{1, 10, 100, 200})] + public float AlphaSum { get; set; } = 100f; /// - /// Whether or not to concatenate an indicator vector column to the value column + /// Dirichlet prior on vocab-topic vectors /// - public bool Concat { get; set; } = true; + [TlcModule.SweepableDiscreteParamAttribute("Beta", new object[]{0.01f, 0.015f, 0.07f, 0.02f})] + public float Beta { get; set; } = 0.01f; + + /// + /// Number of Metropolis Hasting step + /// + [TlcModule.SweepableDiscreteParamAttribute("Mhstep", new object[]{2, 4, 8, 16})] + public int Mhstep { get; set; } = 4; + + /// + /// Number of iterations + /// + [TlcModule.SweepableDiscreteParamAttribute("NumIterations", new object[]{100, 200, 300, 400})] + public int NumIterations { get; set; } = 200; + + /// + /// Compute log likelihood over local dataset on this iteration interval + /// + public int LikelihoodInterval { get; set; } = 5; + + /// + /// The threshold of maximum count of tokens per doc + /// + public int NumMaxDocToken { get; set; } = 512; + + /// + /// The number of training threads. 
Default value depends on number of logical processors. + /// + public int? NumThreads { get; set; } + + /// + /// The number of words to summarize the topic + /// + public int NumSummaryTermPerTopic { get; set; } = 10; + + /// + /// The number of burn-in iterations + /// + [TlcModule.SweepableDiscreteParamAttribute("NumBurninIterations", new object[]{10, 20, 30, 40})] + public int NumBurninIterations { get; set; } = 10; + + /// + /// Reset the random number generator for each document + /// + public bool ResetRandomGenerator { get; set; } = false; + + /// + /// Whether to output the topic-word summary in text format + /// + public bool OutputTopicWordSummary { get; set; } = false; /// /// Input dataset @@ -11440,18 +11435,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(MissingValueHandler)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(LightLda)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new MissingValueHandlerPipelineStep(output); + return new LightLdaPipelineStep(output); } - private class MissingValueHandlerPipelineStep : ILearningPipelineDataStep + private class LightLdaPipelineStep : ILearningPipelineDataStep { - public MissingValueHandlerPipelineStep(Output output) + public LightLdaPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -11466,8 +11461,13 @@ public MissingValueHandlerPipelineStep(Output output) namespace Transforms { - public sealed partial class NAIndicatorTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class NormalizeTransformLogNormalColumn : OneToOneColumn, IOneToOneColumn { + /// + /// Max number of examples used to train the normalizer + /// + public long? 
MaxTrainingExamples { get; set; } + /// /// Name of the new column /// @@ -11481,16 +11481,16 @@ public sealed partial class NAIndicatorTransformColumn : OneToOneColumn - /// Create a boolean output column with the same number of slots as the input column, where the output value is true if the value in the input column is missing. + /// Normalizes the data based on the computed mean and variance of the logarithm of the data. /// - public sealed partial class MissingValueIndicator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class LogMeanVarianceNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public MissingValueIndicator() + public LogMeanVarianceNormalizer() { } - public MissingValueIndicator(params string[] inputColumns) + public LogMeanVarianceNormalizer(params string[] inputColumns) { if (inputColumns != null) { @@ -11501,7 +11501,7 @@ public MissingValueIndicator(params string[] inputColumns) } } - public MissingValueIndicator(params (string inputColumn, string outputColumn)[] inputOutputColumns) + public LogMeanVarianceNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) { if (inputOutputColumns != null) { @@ -11514,23 +11514,33 @@ public MissingValueIndicator(params (string inputColumn, string outputColumn)[] public void AddColumn(string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } + /// + /// Whether to use CDF as the output + /// + public bool UseCdf { get; set; } = true; + /// /// New column definition(s) (optional form: name:src) /// - public NAIndicatorTransformColumn[] Column { get; set; } + public NormalizeTransformLogNormalColumn[] Column { get; set; } + + /// + /// Max number of examples used to train the normalizer + /// + public long MaxTrainingExamples { get; set; } = 1000000000; /// /// Input dataset @@ -11559,18 +11569,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(MissingValueIndicator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(LogMeanVarianceNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new MissingValueIndicatorPipelineStep(output); + return new LogMeanVarianceNormalizerPipelineStep(output); } - private class MissingValueIndicatorPipelineStep : ILearningPipelineDataStep + private class LogMeanVarianceNormalizerPipelineStep : ILearningPipelineDataStep { - public MissingValueIndicatorPipelineStep(Output output) + public LogMeanVarianceNormalizerPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -11584,9 +11594,27 @@ public MissingValueIndicatorPipelineStep(Output output) namespace Transforms { + public enum LpNormNormalizerTransformNormalizerKind : byte + { + L2Norm = 0, + StdDev = 1, + L1Norm = 2, + LInf = 3 + } - public sealed partial class NADropTransformColumn : OneToOneColumn, IOneToOneColumn + + public sealed partial class LpNormNormalizerTransformColumn : OneToOneColumn, IOneToOneColumn { + /// + /// The norm to use to normalize each sample 
+ /// + public LpNormNormalizerTransformNormalizerKind? NormKind { get; set; } + + /// + /// Subtract mean from each value before normalizing + /// + public bool? SubMean { get; set; } + /// /// Name of the new column /// @@ -11600,16 +11628,16 @@ public sealed partial class NADropTransformColumn : OneToOneColumn - /// Removes NAs from vector columns. + /// Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D is either L2 norm, L1 norm or LInf norm. /// - public sealed partial class MissingValuesDropper : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class LpNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public MissingValuesDropper() + public LpNormalizer() { } - public MissingValuesDropper(params string[] inputColumns) + public LpNormalizer(params string[] inputColumns) { if (inputColumns != null) { @@ -11620,7 +11648,7 @@ public MissingValuesDropper(params string[] inputColumns) } } - public MissingValuesDropper(params (string inputColumn, string outputColumn)[] inputOutputColumns) + public LpNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) { if (inputOutputColumns != null) { @@ -11633,23 +11661,33 @@ public MissingValuesDropper(params (string inputColumn, string outputColumn)[] i public void AddColumn(string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? 
new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } /// - /// Columns to drop the NAs for + /// New column definition(s) (optional form: name:src) /// - public NADropTransformColumn[] Column { get; set; } + public LpNormNormalizerTransformColumn[] Column { get; set; } + + /// + /// The norm to use to normalize each sample + /// + public LpNormNormalizerTransformNormalizerKind NormKind { get; set; } = LpNormNormalizerTransformNormalizerKind.L2Norm; + + /// + /// Subtract mean from each value before normalizing + /// + public bool SubMean { get; set; } = false; /// /// Input dataset @@ -11678,18 +11716,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(MissingValuesDropper)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(LpNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new MissingValuesDropperPipelineStep(output); + return new LpNormalizerPipelineStep(output); } - private class MissingValuesDropperPipelineStep : ILearningPipelineDataStep + private class LpNormalizerPipelineStep : ILearningPipelineDataStep { - public MissingValuesDropperPipelineStep(Output output) + public LpNormalizerPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -11705,124 +11743,48 @@ namespace Transforms { /// - /// Filters out rows that contain missing values. + /// Combines a sequence of TransformModels and a PredictorModel into a single PredictorModel. 
/// - public sealed partial class MissingValuesRowDropper : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class ManyHeterogeneousModelCombiner { /// - /// Column - /// - public string[] Column { get; set; } - - /// - /// If true, keep only rows that contain NA values, and filter the rest. + /// Transform model /// - public bool Complement { get; set; } = false; + public ArrayVar TransformModels { get; set; } = new ArrayVar(); /// - /// Input dataset + /// Predictor model /// - public Var Data { get; set; } = new Var(); + public Var PredictorModel { get; set; } = new Var(); - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + public sealed class Output { /// - /// Transformed dataset - /// - public Var OutputData { get; set; } = new Var(); - - /// - /// Transform model + /// Predictor model /// - public Var Model { get; set; } = new Var(); - - } - public Var GetInputData() => Data; - - public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) - { - if (previousStep != null) - { - if (!(previousStep is ILearningPipelineDataStep dataStep)) - { - throw new InvalidOperationException($"{ nameof(MissingValuesRowDropper)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); - } - - Data = dataStep.Data; - } - Output output = experiment.Add(this); - return new MissingValuesRowDropperPipelineStep(output); - } - - private class MissingValuesRowDropperPipelineStep : ILearningPipelineDataStep - { - public MissingValuesRowDropperPipelineStep(Output output) - { - Data = output.OutputData; - Model = output.Model; - } + public Var PredictorModel { get; set; } = new Var(); - public Var Data { get; } - public Var Model { get; } } } } namespace Transforms { - public enum NAReplaceTransformReplacementKind - { - DefaultValue = 0, - Mean = 1, - Minimum = 2, - Maximum = 3, - SpecifiedValue = 4 - } - - - public 
sealed partial class NAReplaceTransformColumn : OneToOneColumn, IOneToOneColumn - { - /// - /// Replacement value for NAs (uses default value if not given) - /// - public string ReplacementString { get; set; } - - /// - /// The replacement method to utilize - /// - public NAReplaceTransformReplacementKind? Kind { get; set; } - - /// - /// Whether to impute values by slot - /// - public bool? Slot { get; set; } - - /// - /// Name of the new column - /// - public string Name { get; set; } - - /// - /// Name of the source column - /// - public string Source { get; set; } - - } /// - /// Create an output column of the same type and size of the input column, where missing values are replaced with either the default value or the mean/min/max value (for non-text columns only). + /// Normalizes the data based on the computed mean and variance of the data. /// - public sealed partial class MissingValueSubstitutor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class MeanVarianceNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public MissingValueSubstitutor() + public MeanVarianceNormalizer() { } - public MissingValueSubstitutor(params string[] inputColumns) + public MeanVarianceNormalizer(params string[] inputColumns) { if (inputColumns != null) { @@ -11833,7 +11795,7 @@ public MissingValueSubstitutor(params string[] inputColumns) } } - public MissingValueSubstitutor(params (string inputColumn, string outputColumn)[] inputOutputColumns) + public MeanVarianceNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) { if (inputOutputColumns != null) { @@ -11846,33 +11808,38 @@ public MissingValueSubstitutor(params (string inputColumn, string outputColumn)[ public void AddColumn(string inputColumn) { - var list = Column == null ? 
new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } /// - /// New column definition(s) (optional form: name:rep:src) + /// Whether to use CDF as the output /// - public NAReplaceTransformColumn[] Column { get; set; } + public bool UseCdf { get; set; } = false; /// - /// The replacement method to utilize + /// New column definition(s) (optional form: name:src) /// - public NAReplaceTransformReplacementKind ReplacementKind { get; set; } = NAReplaceTransformReplacementKind.DefaultValue; + public NormalizeTransformAffineColumn[] Column { get; set; } /// - /// Whether to impute values by slot + /// Whether to map zero to zero, preserving sparsity /// - public bool ImputeBySlot { get; set; } = true; + public bool FixZero { get; set; } = true; + + /// + /// Max number of examples used to train the normalizer + /// + public long MaxTrainingExamples { get; set; } = 1000000000; /// /// Input dataset @@ -11901,18 +11868,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(MissingValueSubstitutor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(MeanVarianceNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new MissingValueSubstitutorPipelineStep(output); + return new 
MeanVarianceNormalizerPipelineStep(output); } - private class MissingValueSubstitutorPipelineStep : ILearningPipelineDataStep + private class MeanVarianceNormalizerPipelineStep : ILearningPipelineDataStep { - public MissingValueSubstitutorPipelineStep(Output output) + public MeanVarianceNormalizerPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -11928,89 +11895,16 @@ namespace Transforms { /// - /// Combines a sequence of TransformModels into a single model - /// - public sealed partial class ModelCombiner - { - - - /// - /// Input models - /// - public ArrayVar Models { get; set; } = new ArrayVar(); - - - public sealed class Output - { - /// - /// Combined model - /// - public Var OutputModel { get; set; } = new Var(); - - } - } - } - - namespace Transforms - { - public enum NgramTransformWeightingCriteria - { - Tf = 0, - Idf = 1, - TfIdf = 2 - } - - - public sealed partial class NgramTransformColumn : OneToOneColumn, IOneToOneColumn - { - /// - /// Maximum ngram length - /// - public int? NgramLength { get; set; } - - /// - /// Whether to include all ngram lengths up to NgramLength or only NgramLength - /// - public bool? AllLengths { get; set; } - - /// - /// Maximum number of tokens to skip when constructing an ngram - /// - public int? SkipLength { get; set; } - - /// - /// Maximum number of ngrams to store in the dictionary - /// - public int[] MaxNumTerms { get; set; } - - /// - /// Statistical measure used to evaluate how important a word is to a document in a corpus - /// - public NgramTransformWeightingCriteria? Weighting { get; set; } - - /// - /// Name of the new column - /// - public string Name { get; set; } - - /// - /// Name of the source column - /// - public string Source { get; set; } - - } - - /// - /// Produces a bag of counts of ngrams (sequences of consecutive values of length 1-n) in a given vector of keys. 
It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag. + /// Normalizes the data based on the observed minimum and maximum values of the data. /// - public sealed partial class NGramTranslator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class MinMaxNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public NGramTranslator() + public MinMaxNormalizer() { } - public NGramTranslator(params string[] inputColumns) + public MinMaxNormalizer(params string[] inputColumns) { if (inputColumns != null) { @@ -12021,7 +11915,7 @@ public NGramTranslator(params string[] inputColumns) } } - public NGramTranslator(params (string inputColumn, string outputColumn)[] inputOutputColumns) + public MinMaxNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) { if (inputOutputColumns != null) { @@ -12034,15 +11928,15 @@ public NGramTranslator(params (string inputColumn, string outputColumn)[] inputO public void AddColumn(string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } @@ -12050,32 +11944,17 @@ public void AddColumn(string outputColumn, string inputColumn) /// /// New column definition(s) (optional form: name:src) /// - public NgramTransformColumn[] Column { get; set; } - - /// - /// Maximum ngram length - /// - public int NgramLength { get; set; } = 2; - - /// - /// Whether to store all ngram lengths up to ngramLength, or only ngramLength - /// - public bool AllLengths { get; set; } = true; - - /// - /// Maximum number of tokens to skip when constructing an ngram - /// - public int SkipLength { get; set; } + public NormalizeTransformAffineColumn[] Column { get; set; } /// - /// Maximum number of ngrams to store in the dictionary + /// Whether to map zero to zero, preserving sparsity /// - public int[] MaxNumTerms { get; set; } = { 10000000 }; + public bool FixZero { get; set; } = true; /// - /// The weighting criteria + /// Max number of examples used to train the normalizer /// - public NgramTransformWeightingCriteria Weighting { get; set; } = NgramTransformWeightingCriteria.Tf; + public long MaxTrainingExamples { get; set; } = 1000000000; /// /// Input dataset @@ -12104,18 +11983,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(NGramTranslator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(MinMaxNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new NGramTranslatorPipelineStep(output); + return new MinMaxNormalizerPipelineStep(output); } - private class NGramTranslatorPipelineStep : ILearningPipelineDataStep + private class MinMaxNormalizerPipelineStep : ILearningPipelineDataStep { - 
public NGramTranslatorPipelineStep(Output output) + public MinMaxNormalizerPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -12129,78 +12008,110 @@ public NGramTranslatorPipelineStep(Output output) namespace Transforms { - - /// - /// Does nothing. - /// - public sealed partial class NoOperation : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public enum NAHandleTransformReplacementKind { + DefaultValue = 0, + Mean = 1, + Minimum = 2, + Maximum = 3 + } + public sealed partial class NAHandleTransformColumn : OneToOneColumn, IOneToOneColumn + { /// - /// Input dataset + /// The replacement method to utilize /// - public Var Data { get; set; } = new Var(); + public NAHandleTransformReplacementKind? Kind { get; set; } + /// + /// Whether to impute values by slot + /// + public bool? ImputeBySlot { get; set; } - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput - { - /// - /// Transformed dataset - /// - public Var OutputData { get; set; } = new Var(); + /// + /// Whether or not to concatenate an indicator vector column to the value column + /// + public bool? ConcatIndicator { get; set; } - /// - /// Transform model - /// - public Var Model { get; set; } = new Var(); + /// + /// Name of the new column + /// + public string Name { get; set; } + + /// + /// Name of the source column + /// + public string Source { get; set; } + + } + + /// + /// Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). An indicator column can optionally be concatenated, if theinput column type is numeric. 
+ /// + public sealed partial class MissingValueHandler : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { + public MissingValueHandler() + { } - public Var GetInputData() => Data; - public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + public MissingValueHandler(params string[] inputColumns) { - if (previousStep != null) + if (inputColumns != null) { - if (!(previousStep is ILearningPipelineDataStep dataStep)) + foreach (string input in inputColumns) { - throw new InvalidOperationException($"{ nameof(NoOperation)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + AddColumn(input); } - - Data = dataStep.Data; } - Output output = experiment.Add(this); - return new NoOperationPipelineStep(output); } - - private class NoOperationPipelineStep : ILearningPipelineDataStep + + public MissingValueHandler(params (string inputColumn, string outputColumn)[] inputOutputColumns) { - public NoOperationPipelineStep(Output output) + if (inputOutputColumns != null) { - Data = output.OutputData; - Model = output.Model; + foreach (var inputOutput in inputOutputColumns) + { + AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); + } } + } + + public void AddColumn(string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); + Column = list.ToArray(); + } - public Var Data { get; } - public Var Model { get; } + public void AddColumn(string outputColumn, string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + Column = list.ToArray(); } - } - } - namespace Transforms - { - /// - /// If the source column does not exist after deserialization, create a column with the right type and default values. 
- /// - public sealed partial class OptionalColumnCreator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem - { + /// + /// New column definition(s) (optional form: name:rep:src) + /// + public NAHandleTransformColumn[] Column { get; set; } + + /// + /// The replacement method to utilize + /// + public NAHandleTransformReplacementKind ReplaceWith { get; set; } = NAHandleTransformReplacementKind.DefaultValue; + /// + /// Whether to impute values by slot + /// + public bool ImputeBySlot { get; set; } = true; /// - /// New column definition(s) + /// Whether or not to concatenate an indicator vector column to the value column /// - public string[] Column { get; set; } + public bool Concat { get; set; } = true; /// /// Input dataset @@ -12229,18 +12140,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(OptionalColumnCreator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(MissingValueHandler)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new OptionalColumnCreatorPipelineStep(output); + return new MissingValueHandlerPipelineStep(output); } - private class OptionalColumnCreatorPipelineStep : ILearningPipelineDataStep + private class MissingValueHandlerPipelineStep : ILearningPipelineDataStep { - public OptionalColumnCreatorPipelineStep(Output output) + public MissingValueHandlerPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -12255,33 +12166,8 @@ public OptionalColumnCreatorPipelineStep(Output output) namespace Transforms { - public sealed partial class PcaTransformColumn : OneToOneColumn, IOneToOneColumn + public sealed partial class NAIndicatorTransformColumn : OneToOneColumn, 
IOneToOneColumn { - /// - /// The name of the weight column - /// - public string WeightColumn { get; set; } - - /// - /// The number of components in the PCA - /// - public int? Rank { get; set; } - - /// - /// Oversampling parameter for randomized PCA training - /// - public int? Oversampling { get; set; } - - /// - /// If enabled, data is centered to be zero mean - /// - public bool? Center { get; set; } - - /// - /// The seed for random number generation - /// - public int? Seed { get; set; } - /// /// Name of the new column /// @@ -12295,16 +12181,16 @@ public sealed partial class PcaTransformColumn : OneToOneColumn - /// Train an PCA Anomaly model. + /// Create a boolean output column with the same number of slots as the input column, where the output value is true if the value in the input column is missing. /// - public sealed partial class PcaCalculator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class MissingValueIndicator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public PcaCalculator() + public MissingValueIndicator() { } - public PcaCalculator(params string[] inputColumns) + public MissingValueIndicator(params string[] inputColumns) { if (inputColumns != null) { @@ -12315,7 +12201,7 @@ public PcaCalculator(params string[] inputColumns) } } - public PcaCalculator(params (string inputColumn, string outputColumn)[] inputOutputColumns) + public MissingValueIndicator(params (string inputColumn, string outputColumn)[] inputOutputColumns) { if (inputOutputColumns != null) { @@ -12328,15 +12214,15 @@ public PcaCalculator(params (string inputColumn, string outputColumn)[] inputOut public void AddColumn(string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } @@ -12344,32 +12230,7 @@ public void AddColumn(string outputColumn, string inputColumn) /// /// New column definition(s) (optional form: name:src) /// - public PcaTransformColumn[] Column { get; set; } - - /// - /// The name of the weight column - /// - public string WeightColumn { get; set; } - - /// - /// The number of components in the PCA - /// - public int Rank { get; set; } = 20; - - /// - /// Oversampling parameter for randomized PCA training - /// - public int Oversampling { get; set; } = 20; - - /// - /// If enabled, data is centered to be zero mean - /// - public bool Center { get; set; } = true; - - /// - /// The seed for random number generation - /// - public int Seed { get; set; } + public NAIndicatorTransformColumn[] Column { get; set; } /// /// Input dataset @@ -12398,18 +12259,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(PcaCalculator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(MissingValueIndicator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new PcaCalculatorPipelineStep(output); + return new MissingValueIndicatorPipelineStep(output); } - private class PcaCalculatorPipelineStep : ILearningPipelineDataStep + private class MissingValueIndicatorPipelineStep : ILearningPipelineDataStep { - 
public PcaCalculatorPipelineStep(Output output) + public MissingValueIndicatorPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -12424,17 +12285,71 @@ public PcaCalculatorPipelineStep(Output output) namespace Transforms { + public sealed partial class NADropTransformColumn : OneToOneColumn, IOneToOneColumn + { + /// + /// Name of the new column + /// + public string Name { get; set; } + + /// + /// Name of the source column + /// + public string Source { get; set; } + + } + /// - /// Transforms a predicted label column to its original values, unless it is of type bool. + /// Removes NAs from vector columns. /// - public sealed partial class PredictedLabelColumnOriginalValueConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class MissingValuesDropper : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { + public MissingValuesDropper() + { + } + + public MissingValuesDropper(params string[] inputColumns) + { + if (inputColumns != null) + { + foreach (string input in inputColumns) + { + AddColumn(input); + } + } + } + + public MissingValuesDropper(params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + if (inputOutputColumns != null) + { + foreach (var inputOutput in inputOutputColumns) + { + AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); + } + } + } + + public void AddColumn(string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); + Column = list.ToArray(); + } + + public void AddColumn(string outputColumn, string inputColumn) + { + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + Column = list.ToArray(); + } + /// - /// The predicted label column + /// Columns to drop the NAs for /// - public string PredictedLabelColumn { get; set; } + public NADropTransformColumn[] Column { get; set; } /// /// Input dataset @@ -12463,18 +12378,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(PredictedLabelColumnOriginalValueConverter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(MissingValuesDropper)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new PredictedLabelColumnOriginalValueConverterPipelineStep(output); + return new MissingValuesDropperPipelineStep(output); } - private class PredictedLabelColumnOriginalValueConverterPipelineStep : ILearningPipelineDataStep + private class MissingValuesDropperPipelineStep : ILearningPipelineDataStep { - public PredictedLabelColumnOriginalValueConverterPipelineStep(Output output) + public MissingValuesDropperPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -12489,46 +12404,22 @@ public PredictedLabelColumnOriginalValueConverterPipelineStep(Output output) namespace Transforms { - public sealed partial class GenerateNumberTransformColumn - { - /// - /// Name of the new column - /// - public string Name { get; set; } - - /// - /// Use an auto-incremented integer starting at zero instead of a random number - /// - public bool? UseCounter { get; set; } - - /// - /// The random seed - /// - public uint? Seed { get; set; } - - } - /// - /// Adds a column with a generated number sequence. + /// Filters out rows that contain missing values. 
/// - public sealed partial class RandomNumberGenerator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class MissingValuesRowDropper : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { /// - /// New column definition(s) (optional form: name:seed) - /// - public GenerateNumberTransformColumn[] Column { get; set; } - - /// - /// Use an auto-incremented integer starting at zero instead of a random number + /// Column /// - public bool UseCounter { get; set; } = false; + public string[] Column { get; set; } /// - /// The random seed + /// If true, keep only rows that contain NA values, and filter the rest. /// - public uint Seed { get; set; } = 42; + public bool Complement { get; set; } = false; /// /// Input dataset @@ -12557,18 +12448,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(RandomNumberGenerator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(MissingValuesRowDropper)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new RandomNumberGeneratorPipelineStep(output); + return new MissingValuesRowDropperPipelineStep(output); } - private class RandomNumberGeneratorPipelineStep : ILearningPipelineDataStep + private class MissingValuesRowDropperPipelineStep : ILearningPipelineDataStep { - public RandomNumberGeneratorPipelineStep(Output output) + public MissingValuesRowDropperPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -12582,113 +12473,106 @@ public RandomNumberGeneratorPipelineStep(Output output) namespace Transforms { - - /// - /// Filters a dataview on a column of type Single, Double or Key 
(contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. If the input is a Key type, the min/max are considered percentages of the number of values. - /// - public sealed partial class RowRangeFilter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public enum NAReplaceTransformReplacementKind { + DefaultValue = 0, + Mean = 1, + Minimum = 2, + Maximum = 3, + SpecifiedValue = 4 + } + public sealed partial class NAReplaceTransformColumn : OneToOneColumn, IOneToOneColumn + { /// - /// Column - /// - public string Column { get; set; } - - /// - /// Minimum value (0 to 1 for key types) + /// Replacement value for NAs (uses default value if not given) /// - public double? Min { get; set; } + public string ReplacementString { get; set; } /// - /// Maximum value (0 to 1 for key types) + /// The replacement method to utilize /// - public double? Max { get; set; } + public NAReplaceTransformReplacementKind? Kind { get; set; } /// - /// If true, keep the values that fall outside the range. + /// Whether to impute values by slot /// - public bool Complement { get; set; } = false; + public bool? Slot { get; set; } /// - /// If true, include in the range the values that are equal to min. + /// Name of the new column /// - public bool IncludeMin { get; set; } = true; + public string Name { get; set; } /// - /// If true, include in the range the values that are equal to max. + /// Name of the source column /// - public bool? IncludeMax { get; set; } + public string Source { get; set; } - /// - /// Input dataset - /// - public Var Data { get; set; } = new Var(); + } + /// + /// Create an output column of the same type and size of the input column, where missing values are replaced with either the default value or the mean/min/max value (for non-text columns only). 
+ /// + public sealed partial class MissingValueSubstitutor : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + public MissingValueSubstitutor() { - /// - /// Transformed dataset - /// - public Var OutputData { get; set; } = new Var(); - - /// - /// Transform model - /// - public Var Model { get; set; } = new Var(); - } - public Var GetInputData() => Data; - public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + public MissingValueSubstitutor(params string[] inputColumns) { - if (previousStep != null) + if (inputColumns != null) { - if (!(previousStep is ILearningPipelineDataStep dataStep)) + foreach (string input in inputColumns) { - throw new InvalidOperationException($"{ nameof(RowRangeFilter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + AddColumn(input); } - - Data = dataStep.Data; } - Output output = experiment.Add(this); - return new RowRangeFilterPipelineStep(output); } - - private class RowRangeFilterPipelineStep : ILearningPipelineDataStep + + public MissingValueSubstitutor(params (string inputColumn, string outputColumn)[] inputOutputColumns) { - public RowRangeFilterPipelineStep(Output output) + if (inputOutputColumns != null) { - Data = output.OutputData; - Model = output.Model; + foreach (var inputOutput in inputOutputColumns) + { + AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); + } } - - public Var Data { get; } - public Var Model { get; } } - } - } + + public void AddColumn(string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); + Column = list.ToArray(); + } - namespace Transforms - { + public void AddColumn(string outputColumn, string inputColumn) + { + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + Column = list.ToArray(); + } - /// - /// Allows limiting input to a subset of rows at an optional offset. Can be used to implement data paging. - /// - public sealed partial class RowSkipAndTakeFilter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem - { + /// + /// New column definition(s) (optional form: name:rep:src) + /// + public NAReplaceTransformColumn[] Column { get; set; } /// - /// Number of items to skip + /// The replacement method to utilize /// - public long? Skip { get; set; } + public NAReplaceTransformReplacementKind ReplacementKind { get; set; } = NAReplaceTransformReplacementKind.DefaultValue; /// - /// Number of items to take + /// Whether to impute values by slot /// - public long? Take { get; set; } + public bool ImputeBySlot { get; set; } = true; /// /// Input dataset @@ -12717,18 +12601,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(RowSkipAndTakeFilter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(MissingValueSubstitutor)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new RowSkipAndTakeFilterPipelineStep(output); + return new MissingValueSubstitutorPipelineStep(output); } - private class RowSkipAndTakeFilterPipelineStep : ILearningPipelineDataStep + private class MissingValueSubstitutorPipelineStep : ILearningPipelineDataStep { - public RowSkipAndTakeFilterPipelineStep(Output output) + public MissingValueSubstitutorPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -12744,146 +12628,154 @@ namespace Transforms { /// - /// Allows limiting input to a 
subset of rows by skipping a number of rows. + /// Combines a sequence of TransformModels into a single model /// - public sealed partial class RowSkipFilter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class ModelCombiner { /// - /// Number of items to skip - /// - public long Count { get; set; } - - /// - /// Input dataset + /// Input models /// - public Var Data { get; set; } = new Var(); + public ArrayVar Models { get; set; } = new ArrayVar(); - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + public sealed class Output { /// - /// Transformed dataset - /// - public Var OutputData { get; set; } = new Var(); - - /// - /// Transform model + /// Combined model /// - public Var Model { get; set; } = new Var(); - - } - public Var GetInputData() => Data; - - public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) - { - if (previousStep != null) - { - if (!(previousStep is ILearningPipelineDataStep dataStep)) - { - throw new InvalidOperationException($"{ nameof(RowSkipFilter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); - } - - Data = dataStep.Data; - } - Output output = experiment.Add(this); - return new RowSkipFilterPipelineStep(output); - } - - private class RowSkipFilterPipelineStep : ILearningPipelineDataStep - { - public RowSkipFilterPipelineStep(Output output) - { - Data = output.OutputData; - Model = output.Model; - } + public Var OutputModel { get; set; } = new Var(); - public Var Data { get; } - public Var Model { get; } } } } namespace Transforms { - - /// - /// Allows limiting input to a subset of rows by taking N first rows. 
- /// - public sealed partial class RowTakeFilter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public enum NgramTransformWeightingCriteria { + Tf = 0, + Idf = 1, + TfIdf = 2 + } + public sealed partial class NgramTransformColumn : OneToOneColumn, IOneToOneColumn + { /// - /// Number of items to take + /// Maximum ngram length /// - public long Count { get; set; } = 9223372036854775807; + public int? NgramLength { get; set; } /// - /// Input dataset + /// Whether to include all ngram lengths up to NgramLength or only NgramLength /// - public Var Data { get; set; } = new Var(); - + public bool? AllLengths { get; set; } - public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput - { - /// - /// Transformed dataset - /// - public Var OutputData { get; set; } = new Var(); + /// + /// Maximum number of tokens to skip when constructing an ngram + /// + public int? SkipLength { get; set; } - /// - /// Transform model - /// - public Var Model { get; set; } = new Var(); + /// + /// Maximum number of ngrams to store in the dictionary + /// + public int[] MaxNumTerms { get; set; } + + /// + /// Statistical measure used to evaluate how important a word is to a document in a corpus + /// + public NgramTransformWeightingCriteria? Weighting { get; set; } + + /// + /// Name of the new column + /// + public string Name { get; set; } + + /// + /// Name of the source column + /// + public string Source { get; set; } + } + + /// + /// Produces a bag of counts of ngrams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag. 
+ /// + public sealed partial class NGramTranslator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { + + public NGramTranslator() + { } - public Var GetInputData() => Data; - public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + public NGramTranslator(params string[] inputColumns) { - if (previousStep != null) + if (inputColumns != null) { - if (!(previousStep is ILearningPipelineDataStep dataStep)) + foreach (string input in inputColumns) { - throw new InvalidOperationException($"{ nameof(RowTakeFilter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + AddColumn(input); } - - Data = dataStep.Data; } - Output output = experiment.Add(this); - return new RowTakeFilterPipelineStep(output); } - - private class RowTakeFilterPipelineStep : ILearningPipelineDataStep + + public NGramTranslator(params (string inputColumn, string outputColumn)[] inputOutputColumns) { - public RowTakeFilterPipelineStep(Output output) + if (inputOutputColumns != null) { - Data = output.OutputData; - Model = output.Model; + foreach (var inputOutput in inputOutputColumns) + { + AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); + } } + } + + public void AddColumn(string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); + Column = list.ToArray(); + } - public Var Data { get; } - public Var Model { get; } + public void AddColumn(string outputColumn, string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + Column = list.ToArray(); } - } - } - namespace Transforms - { - /// - /// Selects only the last score columns and the extra columns specified in the arguments. 
- /// - public sealed partial class ScoreColumnSelector : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem - { + /// + /// New column definition(s) (optional form: name:src) + /// + public NgramTransformColumn[] Column { get; set; } + + /// + /// Maximum ngram length + /// + public int NgramLength { get; set; } = 2; + + /// + /// Whether to store all ngram lengths up to ngramLength, or only ngramLength + /// + public bool AllLengths { get; set; } = true; + /// + /// Maximum number of tokens to skip when constructing an ngram + /// + public int SkipLength { get; set; } /// - /// Extra columns to write + /// Maximum number of ngrams to store in the dictionary /// - public string[] ExtraColumns { get; set; } + public int[] MaxNumTerms { get; set; } = { 10000000 }; + + /// + /// The weighting criteria + /// + public NgramTransformWeightingCriteria Weighting { get; set; } = NgramTransformWeightingCriteria.Tf; /// /// Input dataset @@ -12912,18 +12804,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(ScoreColumnSelector)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(NGramTranslator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new ScoreColumnSelectorPipelineStep(output); + return new NGramTranslatorPipelineStep(output); } - private class ScoreColumnSelectorPipelineStep : ILearningPipelineDataStep + private class NGramTranslatorPipelineStep : ILearningPipelineDataStep { - public ScoreColumnSelectorPipelineStep(Output output) + public NGramTranslatorPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -12939,61 +12831,12 @@ namespace Transforms { /// - /// Turn the predictor model 
into a transform model - /// - public sealed partial class Scorer - { - - - /// - /// The predictor model to turn into a transform - /// - public Var PredictorModel { get; set; } = new Var(); - - - public sealed class Output - { - /// - /// The scored dataset - /// - public Var ScoredData { get; set; } = new Var(); - - /// - /// The scoring transform - /// - public Var ScoringTransform { get; set; } = new Var(); - - } - } - } - - namespace Transforms - { - public enum UngroupTransformUngroupMode - { - Inner = 0, - Outer = 1, - First = 2 - } - - - /// - /// Un-groups vector columns into sequences of rows, inverse of Group transform + /// Does nothing. /// - public sealed partial class Segregator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class NoOperation : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - /// - /// Columns to unroll, or 'pivot' - /// - public string[] Column { get; set; } - - /// - /// Specifies how to unroll multiple pivot columns of different size. 
- /// - public UngroupTransformUngroupMode Mode { get; set; } = UngroupTransformUngroupMode.Inner; - /// /// Input dataset /// @@ -13021,18 +12864,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(Segregator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(NoOperation)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new SegregatorPipelineStep(output); + return new NoOperationPipelineStep(output); } - private class SegregatorPipelineStep : ILearningPipelineDataStep + private class NoOperationPipelineStep : ILearningPipelineDataStep { - public SegregatorPipelineStep(Output output) + public NoOperationPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -13048,21 +12891,16 @@ namespace Transforms { /// - /// Uses a pretrained sentiment model to score input strings + /// If the source column does not exist after deserialization, create a column with the right type and default values. /// - public sealed partial class SentimentAnalyzer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class OptionalColumnCreator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { /// - /// Name of the source column. - /// - public string Source { get; set; } - - /// - /// Name of the new column. 
+ /// New column definition(s) /// - public string Name { get; set; } + public string[] Column { get; set; } /// /// Input dataset @@ -13091,18 +12929,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(SentimentAnalyzer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(OptionalColumnCreator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new SentimentAnalyzerPipelineStep(output); + return new OptionalColumnCreatorPipelineStep(output); } - private class SentimentAnalyzerPipelineStep : ILearningPipelineDataStep + private class OptionalColumnCreatorPipelineStep : ILearningPipelineDataStep { - public SentimentAnalyzerPipelineStep(Output output) + public OptionalColumnCreatorPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -13117,17 +12955,56 @@ public SentimentAnalyzerPipelineStep(Output output) namespace Transforms { + public sealed partial class PcaTransformColumn : OneToOneColumn, IOneToOneColumn + { + /// + /// The name of the weight column + /// + public string WeightColumn { get; set; } + + /// + /// The number of components in the PCA + /// + public int? Rank { get; set; } + + /// + /// Oversampling parameter for randomized PCA training + /// + public int? Oversampling { get; set; } + + /// + /// If enabled, data is centered to be zero mean + /// + public bool? Center { get; set; } + + /// + /// The seed for random number generation + /// + public int? 
Seed { get; set; } + + /// + /// Name of the new column + /// + public string Name { get; set; } + + /// + /// Name of the source column + /// + public string Source { get; set; } + + } + /// - /// Similar to BinNormalizer, but calculates bins based on correlation with the label column, not equi-density. The new value is bin_number / number_of_bins. + /// Train an PCA Anomaly model. /// - public sealed partial class SupervisedBinNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class PcaCalculator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public SupervisedBinNormalizer() + public PcaCalculator() { } - public SupervisedBinNormalizer(params string[] inputColumns) + public PcaCalculator(params string[] inputColumns) { if (inputColumns != null) { @@ -13138,7 +13015,7 @@ public SupervisedBinNormalizer(params string[] inputColumns) } } - public SupervisedBinNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) + public PcaCalculator(params (string inputColumn, string outputColumn)[] inputOutputColumns) { if (inputOutputColumns != null) { @@ -13151,48 +13028,48 @@ public SupervisedBinNormalizer(params (string inputColumn, string outputColumn)[ public void AddColumn(string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); Column = list.ToArray(); } public void AddColumn(string outputColumn, string inputColumn) { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + var list = Column == null ? 
new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); Column = list.ToArray(); } /// - /// Label column for supervised binning + /// New column definition(s) (optional form: name:src) /// - public string LabelColumn { get; set; } + public PcaTransformColumn[] Column { get; set; } /// - /// Minimum number of examples per bin + /// The name of the weight column /// - public int MinBinSize { get; set; } = 10; + public string WeightColumn { get; set; } /// - /// New column definition(s) (optional form: name:src) + /// The number of components in the PCA /// - public NormalizeTransformBinColumn[] Column { get; set; } + public int Rank { get; set; } = 20; /// - /// Max number of bins, power of 2 recommended + /// Oversampling parameter for randomized PCA training /// - public int NumBins { get; set; } = 1024; + public int Oversampling { get; set; } = 20; /// - /// Whether to map zero to zero, preserving sparsity + /// If enabled, data is centered to be zero mean /// - public bool FixZero { get; set; } = true; + public bool Center { get; set; } = true; /// - /// Max number of examples used to train the normalizer + /// The seed for random number generation /// - public long MaxTrainingExamples { get; set; } = 1000000000; + public int Seed { get; set; } /// /// Input dataset @@ -13221,18 +13098,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(SupervisedBinNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(PcaCalculator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new SupervisedBinNormalizerPipelineStep(output); + return new PcaCalculatorPipelineStep(output); } - private class 
SupervisedBinNormalizerPipelineStep : ILearningPipelineDataStep + private class PcaCalculatorPipelineStep : ILearningPipelineDataStep { - public SupervisedBinNormalizerPipelineStep(Output output) + public PcaCalculatorPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -13246,149 +13123,112 @@ public SupervisedBinNormalizerPipelineStep(Output output) namespace Transforms { - public enum TextTransformLanguage - { - English = 1, - French = 2, - German = 3, - Dutch = 4, - Italian = 5, - Spanish = 6, - Japanese = 7 - } - - public enum TextNormalizerTransformCaseNormalizationMode - { - Lower = 0, - Upper = 1, - None = 2 - } - - public enum TextTransformTextNormKind - { - None = 0, - L1 = 1, - L2 = 2, - LInf = 3 - } - - public sealed partial class TextTransformColumn : ManyToOneColumn, IManyToOneColumn + /// + /// Transforms a predicted label column to its original values, unless it is of type bool. + /// + public sealed partial class PredictedLabelColumnOriginalValueConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - /// - /// Name of the new column - /// - public string Name { get; set; } - - /// - /// Name of the source column - /// - public string[] Source { get; set; } - } - public sealed partial class TermLoaderArguments - { /// - /// List of terms + /// The predicted label column /// - public string[] Term { get; set; } + public string PredictedLabelColumn { get; set; } /// - /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). + /// Input dataset /// - public TermTransformSortOrder Sort { get; set; } = TermTransformSortOrder.Occurrence; + public Var Data { get; set; } = new Var(); - /// - /// Drop unknown terms instead of mapping them to NA term. 
- /// - public bool DropUnknowns { get; set; } = false; - } + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + { + /// + /// Transformed dataset + /// + public Var OutputData { get; set; } = new Var(); - /// - /// A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text. - /// - public sealed partial class TextFeaturizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem - { + /// + /// Transform model + /// + public Var Model { get; set; } = new Var(); - public TextFeaturizer() - { - } - - public TextFeaturizer(string outputColumn, params string[] inputColumns) - { - AddColumn(outputColumn, inputColumns); } + public Var GetInputData() => Data; - public void AddColumn(string name, params string[] source) + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) { - Column = ManyToOneColumn.Create(name, source); - } + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(PredictedLabelColumnOriginalValueConverter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + Data = dataStep.Data; + } + Output output = experiment.Add(this); + return new PredictedLabelColumnOriginalValueConverterPipelineStep(output); + } - /// - /// New column definition (optional form: name:srcs). - /// - public TextTransformColumn Column { get; set; } + private class PredictedLabelColumnOriginalValueConverterPipelineStep : ILearningPipelineDataStep + { + public PredictedLabelColumnOriginalValueConverterPipelineStep(Output output) + { + Data = output.OutputData; + Model = output.Model; + } - /// - /// Dataset language or 'AutoDetect' to detect language per row. 
- /// - public TextTransformLanguage Language { get; set; } = TextTransformLanguage.English; + public Var Data { get; } + public Var Model { get; } + } + } + } - /// - /// Stopwords remover. - /// - [JsonConverter(typeof(ComponentSerializer))] - public StopWordsRemover StopWordsRemover { get; set; } + namespace Transforms + { + public sealed partial class GenerateNumberTransformColumn + { /// - /// Casing text using the rules of the invariant culture. + /// Name of the new column /// - public TextNormalizerTransformCaseNormalizationMode TextCase { get; set; } = TextNormalizerTransformCaseNormalizationMode.Lower; + public string Name { get; set; } /// - /// Whether to keep diacritical marks or remove them. + /// Use an auto-incremented integer starting at zero instead of a random number /// - public bool KeepDiacritics { get; set; } = false; + public bool? UseCounter { get; set; } /// - /// Whether to keep punctuation marks or remove them. + /// The random seed /// - public bool KeepPunctuations { get; set; } = true; + public uint? Seed { get; set; } - /// - /// Whether to keep numbers or remove them. - /// - public bool KeepNumbers { get; set; } = true; + } - /// - /// Whether to output the transformed text tokens as an additional column. - /// - public bool OutputTokens { get; set; } = false; + /// + /// Adds a column with a generated number sequence. + /// + public sealed partial class RandomNumberGenerator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { - /// - /// A dictionary of whitelisted terms. - /// - public TermLoaderArguments Dictionary { get; set; } /// - /// Ngram feature extractor to use for words (WordBag/WordHashBag). 
+ /// New column definition(s) (optional form: name:seed) /// - [JsonConverter(typeof(ComponentSerializer))] - public NgramExtractor WordFeatureExtractor { get; set; } = new NGramNgramExtractor(); + public GenerateNumberTransformColumn[] Column { get; set; } /// - /// Ngram feature extractor to use for characters (WordBag/WordHashBag). + /// Use an auto-incremented integer starting at zero instead of a random number /// - [JsonConverter(typeof(ComponentSerializer))] - public NgramExtractor CharFeatureExtractor { get; set; } = new NGramNgramExtractor() { NgramLength = 3, AllLengths = false }; + public bool UseCounter { get; set; } = false; /// - /// Normalize vectors (rows) individually by rescaling them to unit norm. + /// The random seed /// - public TextTransformTextNormKind VectorNormalizer { get; set; } = TextTransformTextNormKind.L2; + public uint Seed { get; set; } = 42; /// /// Input dataset @@ -13417,18 +13257,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(TextFeaturizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(RandomNumberGenerator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new TextFeaturizerPipelineStep(output); + return new RandomNumberGeneratorPipelineStep(output); } - private class TextFeaturizerPipelineStep : ILearningPipelineDataStep + private class RandomNumberGeneratorPipelineStep : ILearningPipelineDataStep { - public TextFeaturizerPipelineStep(Output output) + public RandomNumberGeneratorPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -13444,76 +13284,41 @@ namespace Transforms { /// - /// Converts input values (words, numbers, etc.) to index in a dictionary. 
+ /// Filters a dataview on a column of type Single, Double or Key (contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. If the input is a Key type, the min/max are considered percentages of the number of values. /// - public sealed partial class TextToKeyConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class RowRangeFilter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { - public TextToKeyConverter() - { - } - - public TextToKeyConverter(params string[] inputColumns) - { - if (inputColumns != null) - { - foreach (string input in inputColumns) - { - AddColumn(input); - } - } - } - - public TextToKeyConverter(params (string inputColumn, string outputColumn)[] inputOutputColumns) - { - if (inputOutputColumns != null) - { - foreach (var inputOutput in inputOutputColumns) - { - AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); - } - } - } - - public void AddColumn(string inputColumn) - { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); - Column = list.ToArray(); - } - - public void AddColumn(string outputColumn, string inputColumn) - { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); - Column = list.ToArray(); - } + /// + /// Column + /// + public string Column { get; set; } /// - /// New column definition(s) (optional form: name:src) + /// Minimum value (0 to 1 for key types) /// - public TermTransformColumn[] Column { get; set; } + public double? Min { get; set; } /// - /// Maximum number of terms to keep per column when auto-training + /// Maximum value (0 to 1 for key types) /// - public int MaxNumTerms { get; set; } = 1000000; + public double? 
Max { get; set; } /// - /// List of terms + /// If true, keep the values that fall outside the range. /// - public string[] Term { get; set; } + public bool Complement { get; set; } = false; /// - /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). + /// If true, include in the range the values that are equal to min. /// - public TermTransformSortOrder Sort { get; set; } = TermTransformSortOrder.Occurrence; + public bool IncludeMin { get; set; } = true; /// - /// Whether key value metadata should be text, regardless of the actual input type + /// If true, include in the range the values that are equal to max. /// - public bool TextKeyValues { get; set; } = false; + public bool? IncludeMax { get; set; } /// /// Input dataset @@ -13542,18 +13347,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(TextToKeyConverter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(RowRangeFilter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new TextToKeyConverterPipelineStep(output); + return new RowRangeFilterPipelineStep(output); } - private class TextToKeyConverterPipelineStep : ILearningPipelineDataStep + private class RowRangeFilterPipelineStep : ILearningPipelineDataStep { - public TextToKeyConverterPipelineStep(Output output) + public RowRangeFilterPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -13569,40 +13374,68 @@ namespace Transforms { /// - /// Split the dataset into train and test sets + /// Allows limiting input to a subset of rows at an 
optional offset. Can be used to implement data paging. /// - public sealed partial class TrainTestDatasetSplitter + public sealed partial class RowSkipAndTakeFilter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { /// - /// Input dataset + /// Number of items to skip /// - public Var Data { get; set; } = new Var(); + public long? Skip { get; set; } /// - /// Fraction of training data + /// Number of items to take /// - public float Fraction { get; set; } = 0.8f; + public long? Take { get; set; } /// - /// Stratification column + /// Input dataset /// - public string StratificationColumn { get; set; } + public Var Data { get; set; } = new Var(); - public sealed class Output + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput { /// - /// Training data + /// Transformed dataset /// - public Var TrainData { get; set; } = new Var(); + public Var OutputData { get; set; } = new Var(); /// - /// Testing data + /// Transform model /// - public Var TestData { get; set; } = new Var(); + public Var Model { get; set; } = new Var(); + + } + public Var GetInputData() => Data; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(RowSkipAndTakeFilter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + Data = dataStep.Data; + } + Output output = experiment.Add(this); + return new RowSkipAndTakeFilterPipelineStep(output); + } + + private class RowSkipAndTakeFilterPipelineStep : ILearningPipelineDataStep + { + public RowSkipAndTakeFilterPipelineStep(Output output) + { + Data = output.OutputData; + Model = output.Model; + } + public Var Data { get; } + public Var Model { get; } } } } @@ -13611,26 +13444,16 @@ namespace Transforms { /// - /// Trains a tree ensemble, or 
loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices. + /// Allows limiting input to a subset of rows by skipping a number of rows. /// - public sealed partial class TreeLeafFeaturizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IFeaturizerInput, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + public sealed partial class RowSkipFilter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { /// - /// Output column: The suffix to append to the default column names - /// - public string Suffix { get; set; } - - /// - /// If specified, determines the permutation seed for applying this featurizer to a multiclass problem. 
- /// - public int LabelPermutationSeed { get; set; } - - /// - /// Trainer to use + /// Number of items to skip /// - public Var PredictorModel { get; set; } = new Var(); + public long Count { get; set; } /// /// Input dataset @@ -13659,18 +13482,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(TreeLeafFeaturizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(RowSkipFilter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new TreeLeafFeaturizerPipelineStep(output); + return new RowSkipFilterPipelineStep(output); } - private class TreeLeafFeaturizerPipelineStep : ILearningPipelineDataStep + private class RowSkipFilterPipelineStep : ILearningPipelineDataStep { - public TreeLeafFeaturizerPipelineStep(Output output) + public RowSkipFilterPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -13686,112 +13509,81 @@ namespace Transforms { /// - /// Combines a TransformModel and a PredictorModel into a single PredictorModel. + /// Allows limiting input to a subset of rows by taking N first rows. 
/// - public sealed partial class TwoHeterogeneousModelCombiner + public sealed partial class RowTakeFilter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { /// - /// Transform model + /// Number of items to take /// - public Var TransformModel { get; set; } = new Var(); + public long Count { get; set; } = 9223372036854775807; /// - /// Predictor model + /// Input dataset /// - public Var PredictorModel { get; set; } = new Var(); + public Var Data { get; set; } = new Var(); - public sealed class Output + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput { /// - /// Predictor model + /// Transformed dataset /// - public Var PredictorModel { get; set; } = new Var(); - - } - } - } - - namespace Transforms - { - - public sealed partial class DelimitedTokenizeTransformColumn : OneToOneColumn, IOneToOneColumn - { - /// - /// Comma separated set of term separator(s). Commonly: 'space', 'comma', 'semicolon' or other single character. - /// - public string TermSeparators { get; set; } - - /// - /// Name of the new column - /// - public string Name { get; set; } - - /// - /// Name of the source column - /// - public string Source { get; set; } - - } + public Var OutputData { get; set; } = new Var(); - /// - /// The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed. 
- /// - public sealed partial class WordTokenizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem - { + /// + /// Transform model + /// + public Var Model { get; set; } = new Var(); - public WordTokenizer() - { } + public Var GetInputData() => Data; - public WordTokenizer(params string[] inputColumns) + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) { - if (inputColumns != null) + if (previousStep != null) { - foreach (string input in inputColumns) + if (!(previousStep is ILearningPipelineDataStep dataStep)) { - AddColumn(input); + throw new InvalidOperationException($"{ nameof(RowTakeFilter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } + + Data = dataStep.Data; } + Output output = experiment.Add(this); + return new RowTakeFilterPipelineStep(output); } - - public WordTokenizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) + + private class RowTakeFilterPipelineStep : ILearningPipelineDataStep { - if (inputOutputColumns != null) + public RowTakeFilterPipelineStep(Output output) { - foreach (var inputOutput in inputOutputColumns) - { - AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); - } + Data = output.OutputData; + Model = output.Model; } - } - - public void AddColumn(string inputColumn) - { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(inputColumn)); - Column = list.ToArray(); - } - public void AddColumn(string outputColumn, string inputColumn) - { - var list = Column == null ? new List() : new List(Column); - list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); - Column = list.ToArray(); + public Var Data { get; } + public Var Model { get; } } + } + } + namespace Transforms + { + + /// + /// Selects only the last score columns and the extra columns specified in the arguments. 
+ /// + public sealed partial class ScoreColumnSelector : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { - /// - /// New column definition(s) - /// - public DelimitedTokenizeTransformColumn[] Column { get; set; } /// - /// Comma separated set of term separator(s). Commonly: 'space', 'comma', 'semicolon' or other single character. + /// Extra columns to write /// - public string TermSeparators { get; set; } = "space"; + public string[] ExtraColumns { get; set; } /// /// Input dataset @@ -13820,18 +13612,18 @@ public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Exper { if (!(previousStep is ILearningPipelineDataStep dataStep)) { - throw new InvalidOperationException($"{ nameof(WordTokenizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + throw new InvalidOperationException($"{ nameof(ScoreColumnSelector)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); } Data = dataStep.Data; } Output output = experiment.Add(this); - return new WordTokenizerPipelineStep(output); + return new ScoreColumnSelectorPipelineStep(output); } - private class WordTokenizerPipelineStep : ILearningPipelineDataStep + private class ScoreColumnSelectorPipelineStep : ILearningPipelineDataStep { - public WordTokenizerPipelineStep(Output output) + public ScoreColumnSelectorPipelineStep(Output output) { Data = output.OutputData; Model = output.Model; @@ -13843,323 +13635,1667 @@ public WordTokenizerPipelineStep(Output output) } } - namespace Runtime - { - public abstract class AutoMlEngine : ComponentKind {} + namespace Transforms + { + + /// + /// Turn the predictor model into a transform model + /// + public sealed partial class Scorer + { + + + /// + /// The predictor model to turn into a transform + /// + public Var PredictorModel { get; set; } = new Var(); + + + public sealed class Output + { + /// + /// The scored dataset + /// + public Var ScoredData { get; set; } = new 
Var(); + + /// + /// The scoring transform + /// + public Var ScoringTransform { get; set; } = new Var(); + + } + } + } + + namespace Transforms + { + public enum UngroupTransformUngroupMode + { + Inner = 0, + Outer = 1, + First = 2 + } + + + /// + /// Un-groups vector columns into sequences of rows, inverse of Group transform + /// + public sealed partial class Segregator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { + + + /// + /// Columns to unroll, or 'pivot' + /// + public string[] Column { get; set; } + + /// + /// Specifies how to unroll multiple pivot columns of different size. + /// + public UngroupTransformUngroupMode Mode { get; set; } = UngroupTransformUngroupMode.Inner; + + /// + /// Input dataset + /// + public Var Data { get; set; } = new Var(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + { + /// + /// Transformed dataset + /// + public Var OutputData { get; set; } = new Var(); + + /// + /// Transform model + /// + public Var Model { get; set; } = new Var(); + + } + public Var GetInputData() => Data; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(Segregator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + Data = dataStep.Data; + } + Output output = experiment.Add(this); + return new SegregatorPipelineStep(output); + } + + private class SegregatorPipelineStep : ILearningPipelineDataStep + { + public SegregatorPipelineStep(Output output) + { + Data = output.OutputData; + Model = output.Model; + } + + public Var Data { get; } + public Var Model { get; } + } + } + } + + namespace Transforms + { + + /// + /// Uses a pretrained sentiment model to score input strings + /// + public sealed partial class 
SentimentAnalyzer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { + + + /// + /// Name of the source column. + /// + public string Source { get; set; } + + /// + /// Name of the new column. + /// + public string Name { get; set; } + + /// + /// Input dataset + /// + public Var Data { get; set; } = new Var(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + { + /// + /// Transformed dataset + /// + public Var OutputData { get; set; } = new Var(); + + /// + /// Transform model + /// + public Var Model { get; set; } = new Var(); + + } + public Var GetInputData() => Data; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(SentimentAnalyzer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + Data = dataStep.Data; + } + Output output = experiment.Add(this); + return new SentimentAnalyzerPipelineStep(output); + } + + private class SentimentAnalyzerPipelineStep : ILearningPipelineDataStep + { + public SentimentAnalyzerPipelineStep(Output output) + { + Data = output.OutputData; + Model = output.Model; + } + + public Var Data { get; } + public Var Model { get; } + } + } + } + + namespace Transforms + { + + /// + /// Similar to BinNormalizer, but calculates bins based on correlation with the label column, not equi-density. The new value is bin_number / number_of_bins. 
+ /// + public sealed partial class SupervisedBinNormalizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { + + public SupervisedBinNormalizer() + { + } + + public SupervisedBinNormalizer(params string[] inputColumns) + { + if (inputColumns != null) + { + foreach (string input in inputColumns) + { + AddColumn(input); + } + } + } + + public SupervisedBinNormalizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + if (inputOutputColumns != null) + { + foreach (var inputOutput in inputOutputColumns) + { + AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); + } + } + } + + public void AddColumn(string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); + Column = list.ToArray(); + } + + public void AddColumn(string outputColumn, string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + Column = list.ToArray(); + } + + + /// + /// Label column for supervised binning + /// + public string LabelColumn { get; set; } + + /// + /// Minimum number of examples per bin + /// + public int MinBinSize { get; set; } = 10; + + /// + /// New column definition(s) (optional form: name:src) + /// + public NormalizeTransformBinColumn[] Column { get; set; } + + /// + /// Max number of bins, power of 2 recommended + /// + public int NumBins { get; set; } = 1024; + + /// + /// Whether to map zero to zero, preserving sparsity + /// + public bool FixZero { get; set; } = true; + + /// + /// Max number of examples used to train the normalizer + /// + public long MaxTrainingExamples { get; set; } = 1000000000; + + /// + /// Input dataset + /// + public Var Data { get; set; } = new Var(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + { + /// + /// Transformed dataset + /// + public Var 
OutputData { get; set; } = new Var(); + + /// + /// Transform model + /// + public Var Model { get; set; } = new Var(); + + } + public Var GetInputData() => Data; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(SupervisedBinNormalizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + Data = dataStep.Data; + } + Output output = experiment.Add(this); + return new SupervisedBinNormalizerPipelineStep(output); + } + + private class SupervisedBinNormalizerPipelineStep : ILearningPipelineDataStep + { + public SupervisedBinNormalizerPipelineStep(Output output) + { + Data = output.OutputData; + Model = output.Model; + } + + public Var Data { get; } + public Var Model { get; } + } + } + } + + namespace Transforms + { + public enum TextTransformLanguage + { + English = 1, + French = 2, + German = 3, + Dutch = 4, + Italian = 5, + Spanish = 6, + Japanese = 7 + } + + public enum TextNormalizerTransformCaseNormalizationMode + { + Lower = 0, + Upper = 1, + None = 2 + } + + public enum TextTransformTextNormKind + { + None = 0, + L1 = 1, + L2 = 2, + LInf = 3 + } + + + public sealed partial class TextTransformColumn : ManyToOneColumn, IManyToOneColumn + { + /// + /// Name of the new column + /// + public string Name { get; set; } + + /// + /// Name of the source column + /// + public string[] Source { get; set; } + + } + + public sealed partial class TermLoaderArguments + { + /// + /// List of terms + /// + public string[] Term { get; set; } + + /// + /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). 
+ /// + public TermTransformSortOrder Sort { get; set; } = TermTransformSortOrder.Occurrence; + + /// + /// Drop unknown terms instead of mapping them to NA term. + /// + public bool DropUnknowns { get; set; } = false; + + } + + /// + /// A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text. + /// + public sealed partial class TextFeaturizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { + + public TextFeaturizer() + { + } + + public TextFeaturizer(string outputColumn, params string[] inputColumns) + { + AddColumn(outputColumn, inputColumns); + } + + public void AddColumn(string name, params string[] source) + { + Column = ManyToOneColumn.Create(name, source); + } + + + /// + /// New column definition (optional form: name:srcs). + /// + public TextTransformColumn Column { get; set; } + + /// + /// Dataset language or 'AutoDetect' to detect language per row. + /// + public TextTransformLanguage Language { get; set; } = TextTransformLanguage.English; + + /// + /// Stopwords remover. + /// + [JsonConverter(typeof(ComponentSerializer))] + public StopWordsRemover StopWordsRemover { get; set; } + + /// + /// Casing text using the rules of the invariant culture. + /// + public TextNormalizerTransformCaseNormalizationMode TextCase { get; set; } = TextNormalizerTransformCaseNormalizationMode.Lower; + + /// + /// Whether to keep diacritical marks or remove them. + /// + public bool KeepDiacritics { get; set; } = false; + + /// + /// Whether to keep punctuation marks or remove them. + /// + public bool KeepPunctuations { get; set; } = true; + + /// + /// Whether to keep numbers or remove them. + /// + public bool KeepNumbers { get; set; } = true; + + /// + /// Whether to output the transformed text tokens as an additional column. 
+ /// + public bool OutputTokens { get; set; } = false; + + /// + /// A dictionary of whitelisted terms. + /// + public TermLoaderArguments Dictionary { get; set; } + + /// + /// Ngram feature extractor to use for words (WordBag/WordHashBag). + /// + [JsonConverter(typeof(ComponentSerializer))] + public NgramExtractor WordFeatureExtractor { get; set; } = new NGramNgramExtractor(); + + /// + /// Ngram feature extractor to use for characters (WordBag/WordHashBag). + /// + [JsonConverter(typeof(ComponentSerializer))] + public NgramExtractor CharFeatureExtractor { get; set; } = new NGramNgramExtractor() { NgramLength = 3, AllLengths = false }; + + /// + /// Normalize vectors (rows) individually by rescaling them to unit norm. + /// + public TextTransformTextNormKind VectorNormalizer { get; set; } = TextTransformTextNormKind.L2; + + /// + /// Input dataset + /// + public Var Data { get; set; } = new Var(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + { + /// + /// Transformed dataset + /// + public Var OutputData { get; set; } = new Var(); + + /// + /// Transform model + /// + public Var Model { get; set; } = new Var(); + + } + public Var GetInputData() => Data; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(TextFeaturizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + Data = dataStep.Data; + } + Output output = experiment.Add(this); + return new TextFeaturizerPipelineStep(output); + } + + private class TextFeaturizerPipelineStep : ILearningPipelineDataStep + { + public TextFeaturizerPipelineStep(Output output) + { + Data = output.OutputData; + Model = output.Model; + } + + public Var Data { get; } + public Var Model { get; } + } + } + } + + namespace Transforms + { + + /// + /// 
Converts input values (words, numbers, etc.) to index in a dictionary. + /// + public sealed partial class TextToKeyConverter : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { + + public TextToKeyConverter() + { + } + + public TextToKeyConverter(params string[] inputColumns) + { + if (inputColumns != null) + { + foreach (string input in inputColumns) + { + AddColumn(input); + } + } + } + + public TextToKeyConverter(params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + if (inputOutputColumns != null) + { + foreach (var inputOutput in inputOutputColumns) + { + AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); + } + } + } + + public void AddColumn(string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); + Column = list.ToArray(); + } + + public void AddColumn(string outputColumn, string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + Column = list.ToArray(); + } + + + /// + /// New column definition(s) (optional form: name:src) + /// + public TermTransformColumn[] Column { get; set; } + + /// + /// Maximum number of terms to keep per column when auto-training + /// + public int MaxNumTerms { get; set; } = 1000000; + + /// + /// List of terms + /// + public string[] Term { get; set; } + + /// + /// How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a'). 
+ /// + public TermTransformSortOrder Sort { get; set; } = TermTransformSortOrder.Occurrence; + + /// + /// Whether key value metadata should be text, regardless of the actual input type + /// + public bool TextKeyValues { get; set; } = false; + + /// + /// Input dataset + /// + public Var Data { get; set; } = new Var(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + { + /// + /// Transformed dataset + /// + public Var OutputData { get; set; } = new Var(); + + /// + /// Transform model + /// + public Var Model { get; set; } = new Var(); + + } + public Var GetInputData() => Data; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(TextToKeyConverter)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + Data = dataStep.Data; + } + Output output = experiment.Add(this); + return new TextToKeyConverterPipelineStep(output); + } + + private class TextToKeyConverterPipelineStep : ILearningPipelineDataStep + { + public TextToKeyConverterPipelineStep(Output output) + { + Data = output.OutputData; + Model = output.Model; + } + + public Var Data { get; } + public Var Model { get; } + } + } + } + + namespace Transforms + { + + /// + /// Split the dataset into train and test sets + /// + public sealed partial class TrainTestDatasetSplitter + { + + + /// + /// Input dataset + /// + public Var Data { get; set; } = new Var(); + + /// + /// Fraction of training data + /// + public float Fraction { get; set; } = 0.8f; + + /// + /// Stratification column + /// + public string StratificationColumn { get; set; } + + + public sealed class Output + { + /// + /// Training data + /// + public Var TrainData { get; set; } = new Var(); + + /// + /// Testing data + /// + public Var TestData { get; set; } = new Var(); + 
+ } + } + } + + namespace Transforms + { + + /// + /// Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices. + /// + public sealed partial class TreeLeafFeaturizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IFeaturizerInput, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { + + + /// + /// Output column: The suffix to append to the default column names + /// + public string Suffix { get; set; } + + /// + /// If specified, determines the permutation seed for applying this featurizer to a multiclass problem. 
+ /// + public int LabelPermutationSeed { get; set; } + + /// + /// Trainer to use + /// + public Var PredictorModel { get; set; } = new Var(); + + /// + /// Input dataset + /// + public Var Data { get; set; } = new Var(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + { + /// + /// Transformed dataset + /// + public Var OutputData { get; set; } = new Var(); + + /// + /// Transform model + /// + public Var Model { get; set; } = new Var(); + + } + public Var GetInputData() => Data; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(TreeLeafFeaturizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + Data = dataStep.Data; + } + Output output = experiment.Add(this); + return new TreeLeafFeaturizerPipelineStep(output); + } + + private class TreeLeafFeaturizerPipelineStep : ILearningPipelineDataStep + { + public TreeLeafFeaturizerPipelineStep(Output output) + { + Data = output.OutputData; + Model = output.Model; + } + + public Var Data { get; } + public Var Model { get; } + } + } + } + + namespace Transforms + { + + /// + /// Combines a TransformModel and a PredictorModel into a single PredictorModel. + /// + public sealed partial class TwoHeterogeneousModelCombiner + { + + + /// + /// Transform model + /// + public Var TransformModel { get; set; } = new Var(); + + /// + /// Predictor model + /// + public Var PredictorModel { get; set; } = new Var(); + + + public sealed class Output + { + /// + /// Predictor model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + } + } + + namespace Transforms + { + + public sealed partial class DelimitedTokenizeTransformColumn : OneToOneColumn, IOneToOneColumn + { + /// + /// Comma separated set of term separator(s). 
Commonly: 'space', 'comma', 'semicolon' or other single character. + /// + public string TermSeparators { get; set; } + + /// + /// Name of the new column + /// + public string Name { get; set; } + + /// + /// Name of the source column + /// + public string Source { get; set; } + + } + + /// + /// The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed. + /// + public sealed partial class WordTokenizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { + + public WordTokenizer() + { + } + + public WordTokenizer(params string[] inputColumns) + { + if (inputColumns != null) + { + foreach (string input in inputColumns) + { + AddColumn(input); + } + } + } + + public WordTokenizer(params (string inputColumn, string outputColumn)[] inputOutputColumns) + { + if (inputOutputColumns != null) + { + foreach (var inputOutput in inputOutputColumns) + { + AddColumn(inputOutput.outputColumn, inputOutput.inputColumn); + } + } + } + + public void AddColumn(string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(inputColumn)); + Column = list.ToArray(); + } + + public void AddColumn(string outputColumn, string inputColumn) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(outputColumn, inputColumn)); + Column = list.ToArray(); + } + + + /// + /// New column definition(s) + /// + public DelimitedTokenizeTransformColumn[] Column { get; set; } + + /// + /// Comma separated set of term separator(s). Commonly: 'space', 'comma', 'semicolon' or other single character. 
+ /// + public string TermSeparators { get; set; } = "space"; + + /// + /// Input dataset + /// + public Var Data { get; set; } = new Var(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + { + /// + /// Transformed dataset + /// + public Var OutputData { get; set; } = new Var(); + + /// + /// Transform model + /// + public Var Model { get; set; } = new Var(); + + } + public Var GetInputData() => Data; + + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (previousStep != null) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(WordTokenizer)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + Data = dataStep.Data; + } + Output output = experiment.Add(this); + return new WordTokenizerPipelineStep(output); + } + + private class WordTokenizerPipelineStep : ILearningPipelineDataStep + { + public WordTokenizerPipelineStep(Output output) + { + Data = output.OutputData; + Model = output.Model; + } + + public Var Data { get; } + public Var Model { get; } + } + } + } + + namespace Runtime + { + public abstract class AutoMlEngine : ComponentKind {} + + + + /// + /// AutoML engine that returns learners with default settings. + /// + public sealed class DefaultsAutoMlEngine : AutoMlEngine + { + internal override string ComponentName => "Defaults"; + } + + + + /// + /// AutoML engine that consists of distinct, hierarchical stages of operation. + /// + public sealed class RocketAutoMlEngine : AutoMlEngine + { + /// + /// Number of learners to retain for second stage. + /// + public int TopKLearners { get; set; } = 2; + + /// + /// Number of trials for retained second stage learners. + /// + public int SecondRoundTrialsPerLearner { get; set; } = 5; + + /// + /// Use random initialization only. 
+ /// + public bool RandomInitialization { get; set; } = false; + + /// + /// Number of initilization pipelines, used for random initialization only. + /// + public int NumInitializationPipelines { get; set; } = 20; + + internal override string ComponentName => "Rocket"; + } + + + + /// + /// AutoML engine using uniform random sampling. + /// + public sealed class UniformRandomAutoMlEngine : AutoMlEngine + { + internal override string ComponentName => "UniformRandom"; + } + + public abstract class AutoMlStateBase : ComponentKind {} + + public enum AutoInferenceAutoMlMlStateArgumentsMetrics + { + Auc = 0, + AccuracyMicro = 1, + AccuracyMacro = 2, + L2 = 3, + F1 = 4, + AuPrc = 5, + TopKAccuracy = 6, + Rms = 7, + LossFn = 8, + RSquared = 9, + LogLoss = 10, + LogLossReduction = 11, + Ndcg = 12, + Dcg = 13, + PositivePrecision = 14, + PositiveRecall = 15, + NegativePrecision = 16, + NegativeRecall = 17, + DrAtK = 18, + DrAtPFpr = 19, + DrAtNumPos = 20, + NumAnomalies = 21, + ThreshAtK = 22, + ThreshAtP = 23, + ThreshAtNumPos = 24, + Nmi = 25, + AvgMinScore = 26, + Dbi = 27 + } + + + + /// + /// State of an AutoML search and search space. + /// + public sealed class AutoMlStateAutoMlStateBase : AutoMlStateBase + { + /// + /// Supported metric for evaluator. + /// + public AutoInferenceAutoMlMlStateArgumentsMetrics Metric { get; set; } = AutoInferenceAutoMlMlStateArgumentsMetrics.Auc; + + /// + /// AutoML engine (pipeline optimizer) that generates next candidates. + /// + [JsonConverter(typeof(ComponentSerializer))] + public AutoMlEngine Engine { get; set; } + + /// + /// Kind of trainer for task, such as binary classification trainer, multiclass trainer, etc. + /// + public Microsoft.ML.Models.MacroUtilsTrainerKinds TrainerKind { get; set; } = Microsoft.ML.Models.MacroUtilsTrainerKinds.SignatureBinaryClassifierTrainer; + + /// + /// Arguments for creating terminator, which determines when to stop search. 
+ /// + [JsonConverter(typeof(ComponentSerializer))] + public SearchTerminator TerminatorArgs { get; set; } + + /// + /// Learner set to sweep over (if available). + /// + public string[] RequestedLearners { get; set; } + + internal override string ComponentName => "AutoMlState"; + } + + public abstract class CalibratorTrainer : ComponentKind {} + + + + public sealed class FixedPlattCalibratorCalibratorTrainer : CalibratorTrainer + { + /// + /// The slope parameter of f(x) = 1 / (1 + exp(-slope * x + offset) + /// + public double Slope { get; set; } = 1d; + + /// + /// The offset parameter of f(x) = 1 / (1 + exp(-slope * x + offset) + /// + public double Offset { get; set; } + + internal override string ComponentName => "FixedPlattCalibrator"; + } + + + + public sealed class NaiveCalibratorCalibratorTrainer : CalibratorTrainer + { + internal override string ComponentName => "NaiveCalibrator"; + } + + + + public sealed class PavCalibratorCalibratorTrainer : CalibratorTrainer + { + internal override string ComponentName => "PavCalibrator"; + } + + + + /// + /// Platt calibration. + /// + public sealed class PlattCalibratorCalibratorTrainer : CalibratorTrainer + { + internal override string ComponentName => "PlattCalibrator"; + } + + public abstract class ClassificationLossFunction : ComponentKind {} + + + + /// + /// Exponential loss. + /// + public sealed class ExpLossClassificationLossFunction : ClassificationLossFunction + { + /// + /// Beta (dilation) + /// + public float Beta { get; set; } = 1f; + + internal override string ComponentName => "ExpLoss"; + } + + + + /// + /// Hinge loss. + /// + public sealed class HingeLossClassificationLossFunction : ClassificationLossFunction + { + /// + /// Margin value + /// + public float Margin { get; set; } = 1f; + + internal override string ComponentName => "HingeLoss"; + } + + + + /// + /// Log loss. 
+ /// + public sealed class LogLossClassificationLossFunction : ClassificationLossFunction + { + internal override string ComponentName => "LogLoss"; + } + + + + /// + /// Smoothed Hinge loss. + /// + public sealed class SmoothedHingeLossClassificationLossFunction : ClassificationLossFunction + { + /// + /// Smoothing constant + /// + public float SmoothingConst { get; set; } = 1f; + + internal override string ComponentName => "SmoothedHingeLoss"; + } + + public abstract class EarlyStoppingCriterion : ComponentKind {} + + + + /// + /// Stop in case of loss of generality. + /// + public sealed class GLEarlyStoppingCriterion : EarlyStoppingCriterion + { + /// + /// Threshold in range [0,1]. + /// + [TlcModule.Range(Min = 0f, Max = 1f)] + public float Threshold { get; set; } = 0.01f; + + internal override string ComponentName => "GL"; + } + + + + /// + /// Stops in case of low progress. + /// + public sealed class LPEarlyStoppingCriterion : EarlyStoppingCriterion + { + /// + /// Threshold in range [0,1]. + /// + [TlcModule.Range(Min = 0f, Max = 1f)] + public float Threshold { get; set; } = 0.01f; + + /// + /// The window size. + /// + [TlcModule.Range(Inf = 0)] + public int WindowSize { get; set; } = 5; + + internal override string ComponentName => "LP"; + } + + + + /// + /// Stops in case of generality to progress ration exceeds threshold. + /// + public sealed class PQEarlyStoppingCriterion : EarlyStoppingCriterion + { + /// + /// Threshold in range [0,1]. + /// + [TlcModule.Range(Min = 0f, Max = 1f)] + public float Threshold { get; set; } = 0.01f; + + /// + /// The window size. + /// + [TlcModule.Range(Inf = 0)] + public int WindowSize { get; set; } = 5; + + internal override string ComponentName => "PQ"; + } + + + + /// + /// Stop if validation score exceeds threshold value. + /// + public sealed class TREarlyStoppingCriterion : EarlyStoppingCriterion + { + /// + /// Tolerance threshold. 
(Non negative value) + /// + [TlcModule.Range(Min = 0f)] + public float Threshold { get; set; } = 0.01f; + + internal override string ComponentName => "TR"; + } + + + + /// + /// Stops in case of consecutive loss in generality. + /// + public sealed class UPEarlyStoppingCriterion : EarlyStoppingCriterion + { + /// + /// The window size. + /// + [TlcModule.Range(Inf = 0)] + public int WindowSize { get; set; } = 5; + + internal override string ComponentName => "UP"; + } + + public abstract class EnsembleBinaryOutputCombiner : ComponentKind {} + + + + public sealed class AverageEnsembleBinaryOutputCombiner : EnsembleBinaryOutputCombiner + { + internal override string ComponentName => "Average"; + } + + + + public sealed class MedianEnsembleBinaryOutputCombiner : EnsembleBinaryOutputCombiner + { + internal override string ComponentName => "Median"; + } + + + + public sealed class StackingEnsembleBinaryOutputCombiner : EnsembleBinaryOutputCombiner + { + /// + /// The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; + + internal override string ComponentName => "Stacking"; + } + + + + public sealed class VotingEnsembleBinaryOutputCombiner : EnsembleBinaryOutputCombiner + { + internal override string ComponentName => "Voting"; + } + + public enum WeightageKind + { + Accuracy = 0, + Auc = 1, + PosPrecision = 2, + PosRecall = 3, + NegPrecision = 4, + NegRecall = 5 + } + + + + public sealed class WeightedAverageEnsembleBinaryOutputCombiner : EnsembleBinaryOutputCombiner + { + /// + /// The metric type to be used to find the weights for each model + /// + public WeightageKind WeightageName { get; set; } = WeightageKind.Auc; + + internal override string ComponentName => "WeightedAverage"; + } + + public abstract class EnsembleBinarySubModelSelector : ComponentKind {} + + + + public sealed class AllSelectorEnsembleBinarySubModelSelector : EnsembleBinarySubModelSelector + { + internal override string ComponentName => "AllSelector"; + } + + + + public sealed class BestDiverseSelectorEnsembleBinarySubModelSelector : EnsembleBinarySubModelSelector + { + /// + /// The metric type to be used to find the diversity among base learners + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleDiversityMeasure DiversityMetricType { get; set; } + + /// + /// The proportion of best base learners to be selected. The range is 0.0-1.0 + /// + public float LearnersSelectionProportion { get; set; } = 0.5f; + /// + /// The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; + internal override string ComponentName => "BestDiverseSelector"; + } - /// - /// AutoML engine that returns learners with default settings. 
- /// - public sealed class DefaultsAutoMlEngine : AutoMlEngine + public enum BinaryClassifierEvaluatorMetrics { - internal override string ComponentName => "Defaults"; + Accuracy = 0, + PosPrecName = 1, + PosRecallName = 2, + NegPrecName = 3, + NegRecallName = 4, + Auc = 5, + LogLoss = 6, + LogLossReduction = 7, + F1 = 8, + AuPrc = 9 } - /// - /// AutoML engine that consists of distinct, hierarchical stages of operation. - /// - public sealed class RocketAutoMlEngine : AutoMlEngine + public sealed class BestPerformanceSelectorEnsembleBinarySubModelSelector : EnsembleBinarySubModelSelector { /// - /// Number of learners to retain for second stage. + /// The metric type to be used to find the best performance /// - public int TopKLearners { get; set; } = 2; + public BinaryClassifierEvaluatorMetrics MetricName { get; set; } = BinaryClassifierEvaluatorMetrics.Auc; /// - /// Number of trials for retained second stage learners. + /// The proportion of best base learners to be selected. The range is 0.0-1.0 /// - public int SecondRoundTrialsPerLearner { get; set; } = 5; + public float LearnersSelectionProportion { get; set; } = 0.5f; /// - /// Use random initialization only. + /// The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set /// - public bool RandomInitialization { get; set; } = false; + public float ValidationDatasetProportion { get; set; } = 0.3f; - /// - /// Number of initilization pipelines, used for random initialization only. 
- /// - public int NumInitializationPipelines { get; set; } = 20; + internal override string ComponentName => "BestPerformanceSelector"; + } - internal override string ComponentName => "Rocket"; + public abstract class EnsembleDiversityMeasure : ComponentKind {} + + + + public sealed class DisagreementDiversityMeasureEnsembleDiversityMeasure : EnsembleDiversityMeasure + { + internal override string ComponentName => "DisagreementDiversityMeasure"; } - /// - /// AutoML engine using uniform random sampling. - /// - public sealed class UniformRandomAutoMlEngine : AutoMlEngine + public sealed class MultiDisagreementDiversityMeasureEnsembleDiversityMeasure : EnsembleDiversityMeasure { - internal override string ComponentName => "UniformRandom"; + internal override string ComponentName => "MultiDisagreementDiversityMeasure"; } - public abstract class AutoMlStateBase : ComponentKind {} - public enum AutoInferenceAutoMlMlStateArgumentsMetrics + + public sealed class RegressionDisagreementDiversityMeasureEnsembleDiversityMeasure : EnsembleDiversityMeasure { - Auc = 0, - AccuracyMicro = 1, - AccuracyMacro = 2, - L2 = 3, - F1 = 4, - AuPrc = 5, - TopKAccuracy = 6, - Rms = 7, - LossFn = 8, - RSquared = 9, - LogLoss = 10, - LogLossReduction = 11, - Ndcg = 12, - Dcg = 13, - PositivePrecision = 14, - PositiveRecall = 15, - NegativePrecision = 16, - NegativeRecall = 17, - DrAtK = 18, - DrAtPFpr = 19, - DrAtNumPos = 20, - NumAnomalies = 21, - ThreshAtK = 22, - ThreshAtP = 23, - ThreshAtNumPos = 24, - Nmi = 25, - AvgMinScore = 26, - Dbi = 27 + internal override string ComponentName => "RegressionDisagreementDiversityMeasure"; } + public abstract class EnsembleFeatureSelector : ComponentKind {} - /// - /// State of an AutoML search and search space. 
- /// - public sealed class AutoMlStateAutoMlStateBase : AutoMlStateBase + + public sealed class AllFeatureSelectorEnsembleFeatureSelector : EnsembleFeatureSelector + { + internal override string ComponentName => "AllFeatureSelector"; + } + + + + public sealed class RandomFeatureSelectorEnsembleFeatureSelector : EnsembleFeatureSelector { /// - /// Supported metric for evaluator. + /// The proportion of features to be selected. The range is 0.0-1.0 /// - public AutoInferenceAutoMlMlStateArgumentsMetrics Metric { get; set; } = AutoInferenceAutoMlMlStateArgumentsMetrics.Auc; + public float FeaturesSelectionProportion { get; set; } = 0.8f; + + internal override string ComponentName => "RandomFeatureSelector"; + } + + public abstract class EnsembleMulticlassOutputCombiner : ComponentKind {} + + + public sealed class MultiAverageEnsembleMulticlassOutputCombiner : EnsembleMulticlassOutputCombiner + { /// - /// AutoML engine (pipeline optimizer) that generates next candidates. + /// Whether to normalize the output of base models before combining them /// - [JsonConverter(typeof(ComponentSerializer))] - public AutoMlEngine Engine { get; set; } + public bool Normalize { get; set; } = true; + + internal override string ComponentName => "MultiAverage"; + } + + + public sealed class MultiMedianEnsembleMulticlassOutputCombiner : EnsembleMulticlassOutputCombiner + { /// - /// Kind of trainer for task, such as binary classification trainer, multiclass trainer, etc. 
+ /// Whether to normalize the output of base models before combining them /// - public Microsoft.ML.Models.MacroUtilsTrainerKinds TrainerKind { get; set; } = Microsoft.ML.Models.MacroUtilsTrainerKinds.SignatureBinaryClassifierTrainer; + public bool Normalize { get; set; } = true; + + internal override string ComponentName => "MultiMedian"; + } + + + public sealed class MultiStackingEnsembleMulticlassOutputCombiner : EnsembleMulticlassOutputCombiner + { /// - /// Arguments for creating terminator, which determines when to stop search. + /// The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set /// - [JsonConverter(typeof(ComponentSerializer))] - public SearchTerminator TerminatorArgs { get; set; } + public float ValidationDatasetProportion { get; set; } = 0.3f; + + internal override string ComponentName => "MultiStacking"; + } + + + public sealed class MultiVotingEnsembleMulticlassOutputCombiner : EnsembleMulticlassOutputCombiner + { /// - /// Learner set to sweep over (if available). 
+ /// Whether to normalize the output of base models before combining them /// - public string[] RequestedLearners { get; set; } + public bool Normalize { get; set; } = true; - internal override string ComponentName => "AutoMlState"; + internal override string ComponentName => "MultiVoting"; } - public abstract class CalibratorTrainer : ComponentKind {} + public enum MultiWeightageKind + { + AccuracyMicroAvg = 0, + AccuracyMacroAvg = 1 + } - public sealed class FixedPlattCalibratorCalibratorTrainer : CalibratorTrainer + public sealed class MultiWeightedAverageEnsembleMulticlassOutputCombiner : EnsembleMulticlassOutputCombiner { /// - /// The slope parameter of f(x) = 1 / (1 + exp(-slope * x + offset) + /// The metric type to be used to find the weights for each model /// - public double Slope { get; set; } = 1d; + public MultiWeightageKind WeightageName { get; set; } = MultiWeightageKind.AccuracyMicroAvg; /// - /// The offset parameter of f(x) = 1 / (1 + exp(-slope * x + offset) + /// Whether to normalize the output of base models before combining them /// - public double Offset { get; set; } + public bool Normalize { get; set; } = true; - internal override string ComponentName => "FixedPlattCalibrator"; + internal override string ComponentName => "MultiWeightedAverage"; } + public abstract class EnsembleMulticlassSubModelSelector : ComponentKind {} - public sealed class NaiveCalibratorCalibratorTrainer : CalibratorTrainer + + public sealed class AllSelectorMultiClassEnsembleMulticlassSubModelSelector : EnsembleMulticlassSubModelSelector { - internal override string ComponentName => "NaiveCalibrator"; + internal override string ComponentName => "AllSelectorMultiClass"; } - public sealed class PavCalibratorCalibratorTrainer : CalibratorTrainer + public sealed class BestDiverseSelectorMultiClassEnsembleMulticlassSubModelSelector : EnsembleMulticlassSubModelSelector { - internal override string ComponentName => "PavCalibrator"; - } + /// + /// The metric type to be 
used to find the diversity among base learners + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleDiversityMeasure DiversityMetricType { get; set; } + /// + /// The proportion of best base learners to be selected. The range is 0.0-1.0 + /// + public float LearnersSelectionProportion { get; set; } = 0.5f; + /// + /// The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; - /// - /// Platt calibration. - /// - public sealed class PlattCalibratorCalibratorTrainer : CalibratorTrainer - { - internal override string ComponentName => "PlattCalibrator"; + internal override string ComponentName => "BestDiverseSelectorMultiClass"; } - public abstract class ClassificationLossFunction : ComponentKind {} + public enum MultiClassClassifierEvaluatorMetrics + { + AccuracyMicro = 0, + AccuracyMacro = 1, + LogLoss = 2, + LogLossReduction = 3 + } - /// - /// Exponential loss. - /// - public sealed class ExpLossClassificationLossFunction : ClassificationLossFunction + public sealed class BestPerformanceSelectorMultiClassEnsembleMulticlassSubModelSelector : EnsembleMulticlassSubModelSelector { /// - /// Beta (dilation) + /// The metric type to be used to find the best performance /// - public float Beta { get; set; } = 1f; + public MultiClassClassifierEvaluatorMetrics MetricName { get; set; } = MultiClassClassifierEvaluatorMetrics.AccuracyMicro; - internal override string ComponentName => "ExpLoss"; + /// + /// The proportion of best base learners to be selected. The range is 0.0-1.0 + /// + public float LearnersSelectionProportion { get; set; } = 0.5f; + + /// + /// The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; + + internal override string ComponentName => "BestPerformanceSelectorMultiClass"; } + public abstract class EnsembleRegressionOutputCombiner : ComponentKind {} - /// - /// Hinge loss. - /// - public sealed class HingeLossClassificationLossFunction : ClassificationLossFunction - { - /// - /// Margin value - /// - public float Margin { get; set; } = 1f; - internal override string ComponentName => "HingeLoss"; + public sealed class AverageEnsembleRegressionOutputCombiner : EnsembleRegressionOutputCombiner + { + internal override string ComponentName => "Average"; } - /// - /// Log loss. - /// - public sealed class LogLossClassificationLossFunction : ClassificationLossFunction + public sealed class MedianEnsembleRegressionOutputCombiner : EnsembleRegressionOutputCombiner { - internal override string ComponentName => "LogLoss"; + internal override string ComponentName => "Median"; } - /// - /// Smoothed Hinge loss. - /// - public sealed class SmoothedHingeLossClassificationLossFunction : ClassificationLossFunction + public sealed class RegressionStackingEnsembleRegressionOutputCombiner : EnsembleRegressionOutputCombiner { /// - /// Smoothing constant + /// The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set /// - public float SmoothingConst { get; set; } = 1f; + public float ValidationDatasetProportion { get; set; } = 0.3f; - internal override string ComponentName => "SmoothedHingeLoss"; + internal override string ComponentName => "RegressionStacking"; } - public abstract class EarlyStoppingCriterion : ComponentKind {} + public abstract class EnsembleRegressionSubModelSelector : ComponentKind {} - /// - /// Stop in case of loss of generality. 
- /// - public sealed class GLEarlyStoppingCriterion : EarlyStoppingCriterion + public sealed class AllSelectorEnsembleRegressionSubModelSelector : EnsembleRegressionSubModelSelector { - /// - /// Threshold in range [0,1]. - /// - [TlcModule.Range(Min = 0f, Max = 1f)] - public float Threshold { get; set; } = 0.01f; - - internal override string ComponentName => "GL"; + internal override string ComponentName => "AllSelector"; } - /// - /// Stops in case of low progress. - /// - public sealed class LPEarlyStoppingCriterion : EarlyStoppingCriterion + public sealed class BestDiverseSelectorRegressionEnsembleRegressionSubModelSelector : EnsembleRegressionSubModelSelector { /// - /// Threshold in range [0,1]. + /// The metric type to be used to find the diversity among base learners /// - [TlcModule.Range(Min = 0f, Max = 1f)] - public float Threshold { get; set; } = 0.01f; + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleDiversityMeasure DiversityMetricType { get; set; } /// - /// The window size. + /// The proportion of best base learners to be selected. The range is 0.0-1.0 /// - [TlcModule.Range(Inf = 0)] - public int WindowSize { get; set; } = 5; + public float LearnersSelectionProportion { get; set; } = 0.5f; - internal override string ComponentName => "LP"; + /// + /// The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; + + internal override string ComponentName => "BestDiverseSelectorRegression"; } + public enum RegressionEvaluatorMetrics + { + L1 = 0, + L2 = 1, + Rms = 2, + Loss = 3, + RSquared = 4 + } - /// - /// Stops in case of generality to progress ration exceeds threshold. - /// - public sealed class PQEarlyStoppingCriterion : EarlyStoppingCriterion + + public sealed class BestPerformanceRegressionSelectorEnsembleRegressionSubModelSelector : EnsembleRegressionSubModelSelector { /// - /// Threshold in range [0,1]. 
+ /// The metric type to be used to find the best performance /// - [TlcModule.Range(Min = 0f, Max = 1f)] - public float Threshold { get; set; } = 0.01f; + public RegressionEvaluatorMetrics MetricName { get; set; } = RegressionEvaluatorMetrics.L1; /// - /// The window size. + /// The proportion of best base learners to be selected. The range is 0.0-1.0 /// - [TlcModule.Range(Inf = 0)] - public int WindowSize { get; set; } = 5; + public float LearnersSelectionProportion { get; set; } = 0.5f; - internal override string ComponentName => "PQ"; + /// + /// The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set + /// + public float ValidationDatasetProportion { get; set; } = 0.3f; + + internal override string ComponentName => "BestPerformanceRegressionSelector"; } + public abstract class EnsembleSubsetSelector : ComponentKind {} - /// - /// Stop if validation score exceeds threshold value. - /// - public sealed class TREarlyStoppingCriterion : EarlyStoppingCriterion + + public sealed class AllInstanceSelectorEnsembleSubsetSelector : EnsembleSubsetSelector { /// - /// Tolerance threshold. (Non negative value) + /// The Feature selector /// - [TlcModule.Range(Min = 0f)] - public float Threshold { get; set; } = 0.01f; + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleFeatureSelector FeatureSelector { get; set; } = new AllFeatureSelectorEnsembleFeatureSelector(); - internal override string ComponentName => "TR"; + internal override string ComponentName => "AllInstanceSelector"; } - /// - /// Stops in case of consecutive loss in generality. - /// - public sealed class UPEarlyStoppingCriterion : EarlyStoppingCriterion + public sealed class BootstrapSelectorEnsembleSubsetSelector : EnsembleSubsetSelector { /// - /// The window size. 
+ /// The Feature selector /// - [TlcModule.Range(Inf = 0)] - public int WindowSize { get; set; } = 5; + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleFeatureSelector FeatureSelector { get; set; } = new AllFeatureSelectorEnsembleFeatureSelector(); - internal override string ComponentName => "UP"; + internal override string ComponentName => "BootstrapSelector"; + } + + + + public sealed class RandomPartitionSelectorEnsembleSubsetSelector : EnsembleSubsetSelector + { + /// + /// The Feature selector + /// + [JsonConverter(typeof(ComponentSerializer))] + public EnsembleFeatureSelector FeatureSelector { get; set; } = new AllFeatureSelectorEnsembleFeatureSelector(); + + internal override string ComponentName => "RandomPartitionSelector"; } public abstract class FastTreeTrainer : ComponentKind {} diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index b15d04c860..acea2339aa 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -654,6 +654,57 @@ "IEvaluatorOutput" ] }, + { + "Name": "Models.AnomalyPipelineEnsemble", + "Desc": "Combine anomaly detection models into an ensemble", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Average" + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + 
"IAnomalyDetectionOutput", + "ITrainerOutput" + ] + }, { "Name": "Models.BinaryClassificationEvaluator", "Desc": "Evaluates a binary classification scored dataset.", @@ -967,6 +1018,122 @@ } ] }, + { + "Name": "Models.BinaryEnsemble", + "Desc": "Combine binary classifiers into an ensemble", + "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average", + "Vote" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Median" + }, + { + "Name": "ValidatePipelines", + "Type": "Bool", + "Desc": "Whether to validate that all the pipelines are identical", + "Aliases": [ + "validate" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": true + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IBinaryClassificationOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Models.BinaryPipelineEnsemble", + "Desc": "Combine binary classification models into an ensemble", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average", + "Vote" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": 
false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Median" + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IBinaryClassificationOutput", + "ITrainerOutput" + ] + }, { "Name": "Models.ClassificationEvaluator", "Desc": "Evaluates a multi class classification scored dataset.", @@ -1694,6 +1861,43 @@ "ITransformInput" ] }, + { + "Name": "Models.EnsembleSummary", + "Desc": "Summarize a pipeline ensemble predictor.", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The predictor to summarize", + "Aliases": [ + "predictorModel" + ], + "Required": true, + "SortOrder": 150.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "Summaries", + "Type": { + "Kind": "Array", + "ItemType": "DataView" + }, + "Desc": "The summaries of the individual predictors" + }, + { + "Name": "Stats", + "Type": { + "Kind": "Array", + "ItemType": "DataView" + }, + "Desc": "The model statistics of the individual predictors" + } + ] + }, { "Name": "Models.FixedPlattCalibrator", "Desc": "Apply a Platt calibrator with a fixed slope and offset to an input model", @@ -1776,6 +1980,58 @@ "ITrainerOutput" ] }, + { + "Name": "Models.MultiClassPipelineEnsemble", + "Desc": "Combine multiclass classifiers into an ensemble", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average", + "Vote" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Median" + } + ], + 
"Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IMulticlassClassificationOutput", + "ITrainerOutput" + ] + }, { "Name": "Models.MultiOutputRegressionEvaluator", "Desc": "Evaluates a multi output regression scored dataset.", @@ -2822,6 +3078,69 @@ "IEvaluatorOutput" ] }, + { + "Name": "Models.RegressionEnsemble", + "Desc": "Combine regression models into an ensemble", + "FriendlyName": "Regression Ensemble (bagging, stacking, etc)", + "ShortName": null, + "Inputs": [ + { + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + "Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Median" + }, + { + "Name": "ValidatePipelines", + "Type": "Bool", + "Desc": "Whether to validate that all the pipelines are identical", + "Aliases": [ + "validate" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": true + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IRegressionOutput", + "ITrainerOutput" + ] + }, { "Name": "Models.RegressionEvaluator", "Desc": "Evaluates a regression scored dataset.", @@ -2945,13 +3264,64 @@ ] }, { - "Name": "Models.Summarizer", - "Desc": "Summarize a linear regression predictor.", + "Name": "Models.RegressionPipelineEnsemble", + "Desc": "Combine regression models into an ensemble", "FriendlyName": null, "ShortName": null, "Inputs": [ { - "Name": "PredictorModel", + "Name": "Models", + "Type": { + "Kind": "Array", + "ItemType": "PredictorModel" + }, + 
"Desc": "The models to combine into an ensemble", + "Aliases": [ + "models" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ModelCombiner", + "Type": { + "Kind": "Enum", + "Values": [ + "Median", + "Average" + ] + }, + "Desc": "The combiner used to combine the scores", + "Aliases": [ + "combiner" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Median" + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "OutputKind": [ + "IRegressionOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Models.Summarizer", + "Desc": "Summarize a linear regression predictor.", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "PredictorModel", "Type": "PredictorModel", "Desc": "The predictor to summarize", "Aliases": [ @@ -3695,31 +4065,11 @@ ] }, { - "Name": "Trainers.FastForestBinaryClassifier", - "Desc": "Uses a random forest learner to perform binary classification.", - "FriendlyName": "Fast Forest Classification", - "ShortName": "ff", + "Name": "Trainers.EnsembleBinaryClassifier", + "Desc": "Train binary ensemble.", + "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", + "ShortName": null, "Inputs": [ - { - "Name": "NumTrees", - "Type": "Int", - "Desc": "Total number of decision trees to create in the ensemble", - "Aliases": [ - "iter" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": 100, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 20, - 100, - 500 - ] - } - }, { "Name": "TrainingData", "Type": "DataView", @@ -3732,22 +4082,25 @@ "IsNullable": false }, { - "Name": "NumLeaves", - "Type": "Int", - "Desc": "The max number of leaves in each regression tree", + "Name": "SamplingType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleSubsetSelector" + }, + "Desc": "Sampling Type", "Aliases": [ - "nl" + "st" ], "Required": false, "SortOrder": 2.0, 
"IsNullable": false, - "Default": 20, - "SweepRange": { - "RangeType": "Long", - "Min": 2, - "Max": 128, - "StepSize": 4.0, - "IsLogScale": true + "Default": { + "Name": "BootstrapSelector", + "Settings": { + "FeatureSelector": { + "Name": "AllFeatureSelector" + } + } } }, { @@ -3763,24 +4116,16 @@ "Default": "Features" }, { - "Name": "MinDocumentsInLeafs", + "Name": "NumModels", "Type": "Int", - "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", + "Desc": "Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise.", "Aliases": [ - "mil" + "nm" ], "Required": false, "SortOrder": 3.0, - "IsNullable": false, - "Default": 10, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 1, - 10, - 50 - ] - } + "IsNullable": true, + "Default": null }, { "Name": "LabelColumn", @@ -3795,28 +4140,38 @@ "Default": "Label" }, { - "Name": "WeightColumn", - "Type": "String", - "Desc": "Column to use for example weight", + "Name": "SubModelSelectorType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleBinarySubModelSelector" + }, + "Desc": "Algorithm to prune the base learners for selective Ensemble", "Aliases": [ - "weight" + "pt" ], "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": "Weight" + "Default": { + "Name": "AllSelector" + } }, { - "Name": "GroupIdColumn", - "Type": "String", - "Desc": "Column to use for example groupId", + "Name": "OutputCombiner", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleBinaryOutputCombiner" + }, + "Desc": "Output combiner", "Aliases": [ - "groupId" + "oc" ], "Required": false, "SortOrder": 5.0, "IsNullable": false, - "Default": "GroupId" + "Default": { + "Name": "Median" + } }, { "Name": "NormalizeFeatures", @@ -3859,472 +4214,444 @@ "Default": "Auto" }, { - "Name": "MaxTreeOutput", - "Type": "Float", - "Desc": "Upper bound on absolute value of 
single tree output", + "Name": "TrainParallel", + "Type": "Bool", + "Desc": "All the base learners will run asynchronously if the value is true", "Aliases": [ - "mo" + "tp" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 100.0 - }, - { - "Name": "Calibrator", - "Type": { - "Kind": "Component", - "ComponentKind": "CalibratorTrainer" - }, - "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", - "Required": false, - "SortOrder": 150.0, + "SortOrder": 106.0, "IsNullable": false, - "Default": { - "Name": "PlattCalibrator" - } + "Default": false }, { - "Name": "MaxCalibrationExamples", + "Name": "BatchSize", "Type": "Int", - "Desc": "The maximum number of examples to use when training the calibrator", + "Desc": "Batch size", + "Aliases": [ + "bs" + ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 107.0, "IsNullable": false, - "Default": 1000000 + "Default": -1 }, { - "Name": "QuantileSampleCount", - "Type": "Int", - "Desc": "Number of labels to be sampled from each leaf to make the distribtuion", + "Name": "ShowMetrics", + "Type": "Bool", + "Desc": "True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set", "Aliases": [ - "qsc" + "sm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 108.0, "IsNullable": false, - "Default": 100 + "Default": false + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IBinaryClassificationOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.EnsembleClassification", + "Desc": "Train multiclass ensemble.", + "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", + "ShortName": null, + "Inputs": [ + { + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "ParallelTrainer", + "Name": "SamplingType", "Type": { "Kind": "Component", - "ComponentKind": "ParallelTraining" + "ComponentKind": "EnsembleSubsetSelector" }, - "Desc": "Allows to choose Parallel FastTree Learning Algorithm", + "Desc": "Sampling Type", "Aliases": [ - "parag" + "st" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, "Default": { - "Name": "Single" + "Name": "BootstrapSelector", + "Settings": { + "FeatureSelector": { + "Name": "AllFeatureSelector" + } + } } }, { - "Name": "NumThreads", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", + "Aliases": [ + "feat" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Features" + }, + { + "Name": "NumModels", "Type": "Int", - "Desc": "The number of threads to use", + "Desc": "Number of models per batch. 
If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise.", "Aliases": [ - "t" + "nm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": true, "Default": null }, { - "Name": "RngSeed", - "Type": "Int", - "Desc": "The seed of the random number generator", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", "Aliases": [ - "r1" + "lab" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 123 + "Default": "Label" }, { - "Name": "FeatureSelectSeed", - "Type": "Int", - "Desc": "The seed of the active feature selection", + "Name": "SubModelSelectorType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleMulticlassSubModelSelector" + }, + "Desc": "Algorithm to prune the base learners for selective Ensemble", "Aliases": [ - "r3" + "pt" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": 123 + "Default": { + "Name": "AllSelectorMultiClass" + } }, { - "Name": "EntropyCoefficient", - "Type": "Float", - "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Name": "OutputCombiner", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleMulticlassOutputCombiner" + }, + "Desc": "Output combiner", "Aliases": [ - "e" + "oc" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": 0.0 + "Default": { + "Name": "MultiMedian" + } }, { - "Name": "HistogramPoolSize", - "Type": "Int", - "Desc": "The number of histograms in the pool (between 2 and numLeaves)", + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", "Aliases": [ - "ps" + "norm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": -1 - }, - { - "Name": "DiskTranspose", - "Type": 
"Bool", - "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", - "Aliases": [ - "dt" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "Default": "Auto" }, { - "Name": "FeatureFlocks", - "Type": "Bool", - "Desc": "Whether to collectivize features during dataset preparation to speed up training", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", "Aliases": [ - "flocks" + "cache" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": true + "Default": "Auto" }, { - "Name": "CategoricalSplit", + "Name": "TrainParallel", "Type": "Bool", - "Desc": "Whether to do split based on multiple categorical feature values.", + "Desc": "All the base learners will run asynchronously if the value is true", "Aliases": [ - "cat" + "tp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 106.0, "IsNullable": false, "Default": false }, { - "Name": "MaxCategoricalGroupsPerNode", + "Name": "BatchSize", "Type": "Int", - "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. This is used to reduce overfitting when there many categorical features.", + "Desc": "Batch size", "Aliases": [ - "mcg" + "bs" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 107.0, "IsNullable": false, - "Default": 64 + "Default": -1 }, { - "Name": "MaxCategoricalSplitPoints", - "Type": "Int", - "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", + "Name": "ShowMetrics", + "Type": "Bool", + "Desc": "True, if metrics for each model need to be evaluated and shown in comparison table. 
This is done by using validation set if available or the training set", "Aliases": [ - "maxcat" + "sm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 108.0, "IsNullable": false, - "Default": 64 - }, + "Default": false + } + ], + "Outputs": [ { - "Name": "MinDocsPercentageForCategoricalSplit", - "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", - "Aliases": [ - "mdop" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.001 - }, + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IMulticlassClassificationOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.EnsembleRegression", + "Desc": "Train regression ensemble.", + "FriendlyName": "Parallel Ensemble (bagging, stacking, etc)", + "ShortName": null, + "Inputs": [ { - "Name": "MinDocsForCategoricalSplit", - "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", "Aliases": [ - "mdo" + "data" ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 100 + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "Bias", - "Type": "Float", - "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", + "Name": "SamplingType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleSubsetSelector" + }, + "Desc": "Sampling Type", "Aliases": [ - "bias" + "st" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": 0.0 + "Default": { + "Name": "BootstrapSelector", + "Settings": { + "FeatureSelector": { + "Name": "AllFeatureSelector" + } + } + } }, { - "Name": "Bundling", - "Type": { - "Kind": "Enum", - "Values": [ - "None", - 
"AggregateLowPopulation", - "Adjacent" - ] - }, - "Desc": "Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", "Aliases": [ - "bundle" + "feat" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": "None" + "Default": "Features" }, { - "Name": "MaxBins", + "Name": "NumModels", "Type": "Int", - "Desc": "Maximum number of distinct values (bins) per feature", + "Desc": "Number of models per batch. If not specified, will default to 50 if there is only one base predictor, or the number of base predictors otherwise.", "Aliases": [ - "mb" + "nm" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 255 + "SortOrder": 3.0, + "IsNullable": true, + "Default": null }, { - "Name": "SparsifyThreshold", - "Type": "Float", - "Desc": "Sparsity level needed to use sparse feature representation", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", "Aliases": [ - "sp" + "lab" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 0.7 + "Default": "Label" }, { - "Name": "FeatureFirstUsePenalty", - "Type": "Float", - "Desc": "The feature first use penalty coefficient", + "Name": "SubModelSelectorType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleRegressionSubModelSelector" + }, + "Desc": "Algorithm to prune the base learners for selective Ensemble", "Aliases": [ - "ffup" + "pt" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": 0.0 + "Default": { + "Name": "AllSelector" + } }, { - "Name": "FeatureReusePenalty", - "Type": "Float", - "Desc": "The feature re-use penalty (regularization) coefficient", + "Name": "OutputCombiner", + "Type": { + "Kind": "Component", + "ComponentKind": 
"EnsembleRegressionOutputCombiner" + }, + "Desc": "Output combiner", "Aliases": [ - "frup" + "oc" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": 0.0 + "Default": { + "Name": "Median" + } }, { - "Name": "GainConfidenceLevel", - "Type": "Float", - "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", "Aliases": [ - "gainconf" + "norm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": 0.0 + "Default": "Auto" }, { - "Name": "SoftmaxTemperature", - "Type": "Float", - "Desc": "The temperature of the randomized softmax distribution for choosing the feature", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", "Aliases": [ - "smtemp" + "cache" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": 0.0 + "Default": "Auto" }, { - "Name": "ExecutionTimes", + "Name": "TrainParallel", "Type": "Bool", - "Desc": "Print execution time breakdown to stdout", + "Desc": "All the base learners will run asynchronously if the value is true", "Aliases": [ - "et" + "tp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 106.0, "IsNullable": false, "Default": false }, { - "Name": "FeatureFraction", - "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each iteration", + "Name": "BatchSize", + "Type": "Int", + "Desc": "Batch size", "Aliases": [ - "ff" + "bs" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 107.0, "IsNullable": false, - "Default": 0.7 + "Default": -1 }, { - "Name": "BaggingSize", - "Type": "Int", - "Desc": "Number of trees in each bag (0 for disabling bagging)", - 
"Aliases": [ - "bag" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1 - }, - { - "Name": "BaggingTrainFraction", - "Type": "Float", - "Desc": "Percentage of training examples used in each bag", - "Aliases": [ - "bagfrac" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.7 - }, - { - "Name": "SplitFraction", - "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each split", - "Aliases": [ - "sf" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.7 - }, - { - "Name": "Smoothing", - "Type": "Float", - "Desc": "Smoothing paramter for tree regularization", - "Aliases": [ - "s" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "AllowEmptyTrees", - "Type": "Bool", - "Desc": "When a root split is impossible, allow training to proceed", - "Aliases": [ - "allowempty", - "dummies" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "FeatureCompressionLevel", - "Type": "Int", - "Desc": "The level of feature compression to use", - "Aliases": [ - "fcomp" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1 - }, - { - "Name": "CompressEnsemble", - "Type": "Bool", - "Desc": "Compress the tree Ensemble", - "Aliases": [ - "cmp" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "MaxTreesAfterCompression", - "Type": "Int", - "Desc": "Maximum Number of trees after compression", - "Aliases": [ - "cmpmax" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": -1 - }, - { - "Name": "PrintTestGraph", + "Name": "ShowMetrics", "Type": "Bool", - "Desc": "Print metrics graph for the first test set", - "Aliases": [ - "graph" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - 
"Name": "PrintTrainValidGraph", - "Type": "Bool", - "Desc": "Print Train and Validation metrics in graph", + "Desc": "True, if metrics for each model need to be evaluated and shown in comparison table. This is done by using validation set if available or the training set", "Aliases": [ - "graphtv" + "sm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 108.0, "IsNullable": false, "Default": false - }, - { - "Name": "TestFrequency", - "Type": "Int", - "Desc": "Calculate metric values for train/valid/test every k rounds", - "Aliases": [ - "tf" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 2147483647 } ], "Outputs": [ @@ -4335,21 +4662,19 @@ } ], "InputKind": [ - "ITrainerInputWithGroupId", - "ITrainerInputWithWeight", "ITrainerInputWithLabel", "ITrainerInput" ], "OutputKind": [ - "IBinaryClassificationOutput", + "IRegressionOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.FastForestRegressor", - "Desc": "Trains a random forest to fit target values using least-squares.", - "FriendlyName": "FastForestRegression", - "ShortName": "ffr", + "Name": "Trainers.FastForestBinaryClassifier", + "Desc": "Uses a random forest learner to perform binary classification.", + "FriendlyName": "Fast Forest Classification", + "ShortName": "ff", "Inputs": [ { "Name": "NumTrees", @@ -4510,13 +4835,39 @@ "Default": "Auto" }, { - "Name": "ShuffleLabels", - "Type": "Bool", - "Desc": "Shuffle the labels on every iteration. Useful probably only if using this tree as a tree leaf featurizer for multiclass.", + "Name": "MaxTreeOutput", + "Type": "Float", + "Desc": "Upper bound on absolute value of single tree output", + "Aliases": [ + "mo" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 100.0 + }, + { + "Name": "Calibrator", + "Type": { + "Kind": "Component", + "ComponentKind": "CalibratorTrainer" + }, + "Desc": "The calibrator kind to apply to the predictor. 
Specify null for no calibration", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": { + "Name": "PlattCalibrator" + } + }, + { + "Name": "MaxCalibrationExamples", + "Type": "Int", + "Desc": "The maximum number of examples to use when training the calibrator", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000000 }, { "Name": "QuantileSampleCount", @@ -4966,15 +5317,15 @@ "ITrainerInput" ], "OutputKind": [ - "IRegressionOutput", + "IBinaryClassificationOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.FastTreeBinaryClassifier", - "Desc": "Uses a logit-boost boosted tree learner to perform binary classification.", - "FriendlyName": "FastTree (Boosted Trees) Classification", - "ShortName": "ftc", + "Name": "Trainers.FastForestRegressor", + "Desc": "Trains a random forest to fit target values using least-squares.", + "FriendlyName": "FastForestRegression", + "ShortName": "ffr", "Inputs": [ { "Name": "NumTrees", @@ -5070,24 +5421,6 @@ "IsNullable": false, "Default": "Label" }, - { - "Name": "LearningRates", - "Type": "Float", - "Desc": "The learning rate", - "Aliases": [ - "lr" - ], - "Required": false, - "SortOrder": 4.0, - "IsNullable": false, - "Default": 0.2, - "SweepRange": { - "RangeType": "Float", - "Min": 0.025, - "Max": 0.4, - "IsLogScale": true - } - }, { "Name": "WeightColumn", "Type": "String", @@ -5153,623 +5486,354 @@ "Default": "Auto" }, { - "Name": "UnbalancedSets", + "Name": "ShuffleLabels", "Type": "Bool", - "Desc": "Should we use derivatives optimized for unbalanced sets", - "Aliases": [ - "us" - ], + "Desc": "Shuffle the labels on every iteration. 
Useful probably only if using this tree as a tree leaf featurizer for multiclass.", "Required": false, "SortOrder": 150.0, "IsNullable": false, "Default": false }, { - "Name": "BestStepRankingRegressionTrees", - "Type": "Bool", - "Desc": "Use best regression step trees?", + "Name": "QuantileSampleCount", + "Type": "Int", + "Desc": "Number of labels to be sampled from each leaf to make the distribtuion", "Aliases": [ - "bsr" + "qsc" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 100 }, { - "Name": "UseLineSearch", - "Type": "Bool", - "Desc": "Should we use line search for a step size", + "Name": "ParallelTrainer", + "Type": { + "Kind": "Component", + "ComponentKind": "ParallelTraining" + }, + "Desc": "Allows to choose Parallel FastTree Learning Algorithm", "Aliases": [ - "ls" + "parag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": { + "Name": "Single" + } }, { - "Name": "NumPostBracketSteps", + "Name": "NumThreads", "Type": "Int", - "Desc": "Number of post-bracket line search steps", + "Desc": "The number of threads to use", "Aliases": [ - "lssteps" + "t" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": 0 + "IsNullable": true, + "Default": null }, { - "Name": "MinStepSize", - "Type": "Float", - "Desc": "Minimum line search step size", + "Name": "RngSeed", + "Type": "Int", + "Desc": "The seed of the random number generator", "Aliases": [ - "minstep" + "r1" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 123 }, { - "Name": "OptimizationAlgorithm", - "Type": { - "Kind": "Enum", - "Values": [ - "GradientDescent", - "AcceleratedGradientDescent", - "ConjugateGradientDescent" - ] - }, - "Desc": "Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent)", + "Name": "FeatureSelectSeed", + "Type": "Int", + "Desc": "The seed of the active feature selection", "Aliases": [ - "oa" + "r3" ], 
"Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "GradientDescent" + "Default": 123 }, { - "Name": "EarlyStoppingRule", - "Type": { - "Kind": "Component", - "ComponentKind": "EarlyStoppingCriterion" - }, - "Desc": "Early stopping rule. (Validation set (/valid) is required.)", + "Name": "EntropyCoefficient", + "Type": "Float", + "Desc": "The entropy (regularization) coefficient between 0 and 1", "Aliases": [ - "esr" + "e" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": 0.0 }, { - "Name": "EarlyStoppingMetrics", + "Name": "HistogramPoolSize", "Type": "Int", - "Desc": "Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3)", + "Desc": "The number of histograms in the pool (between 2 and numLeaves)", "Aliases": [ - "esmt" + "ps" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": -1 }, { - "Name": "EnablePruning", + "Name": "DiskTranspose", "Type": "Bool", - "Desc": "Enable post-training pruning to avoid overfitting. 
(a validation set is required)", + "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", "Aliases": [ - "pruning" + "dt" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": false + "IsNullable": true, + "Default": null }, { - "Name": "UseTolerantPruning", + "Name": "FeatureFlocks", "Type": "Bool", - "Desc": "Use window and tolerance for pruning", + "Desc": "Whether to collectivize features during dataset preparation to speed up training", "Aliases": [ - "prtol" + "flocks" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": true }, { - "Name": "PruningThreshold", - "Type": "Float", - "Desc": "The tolerance threshold for pruning", + "Name": "CategoricalSplit", + "Type": "Bool", + "Desc": "Whether to do split based on multiple categorical feature values.", "Aliases": [ - "prth" + "cat" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.004 + "Default": false }, { - "Name": "PruningWindowSize", + "Name": "MaxCategoricalGroupsPerNode", "Type": "Int", - "Desc": "The moving window size for pruning", + "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ - "prws" + "mcg" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 5 + "Default": 64 }, { - "Name": "Shrinkage", - "Type": "Float", - "Desc": "Shrinkage", + "Name": "MaxCategoricalSplitPoints", + "Type": "Int", + "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ - "shrk" + "maxcat" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.025, - "Max": 4.0, - "IsLogScale": true - } + "Default": 64 }, { - "Name": "DropoutRate", + "Name": "MinDocsPercentageForCategoricalSplit", "Type": "Float", - "Desc": "Dropout rate for tree regularization", + "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", "Aliases": [ - "tdrop" + "mdop" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.0, - 1E-09, - 0.05, - 0.1, - 0.2 - ] - } + "Default": 0.001 }, { - "Name": "GetDerivativesSampleRate", + "Name": "MinDocsForCategoricalSplit", "Type": "Int", - "Desc": "Sample each query 1 in k times in the GetDerivatives function", + "Desc": "Minimum categorical doc count in a bin to consider for a split.", "Aliases": [ - "sr" + "mdo" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": 100 }, { - "Name": "WriteLastEnsemble", - "Type": "Bool", - "Desc": "Write the last ensemble instead of the one determined by early stopping", + "Name": "Bias", + "Type": "Float", + "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", "Aliases": [ - "hl" + "bias" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.0 }, { - "Name": "MaxTreeOutput", - "Type": "Float", - "Desc": "Upper bound on absolute value of single tree 
output", + "Name": "Bundling", + "Type": { + "Kind": "Enum", + "Values": [ + "None", + "AggregateLowPopulation", + "Adjacent" + ] + }, + "Desc": "Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", "Aliases": [ - "mo" + "bundle" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 100.0 + "Default": "None" }, { - "Name": "RandomStart", - "Type": "Bool", - "Desc": "Training starts from random ordering (determined by /r1)", + "Name": "MaxBins", + "Type": "Int", + "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ - "rs" + "mb" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 255 }, { - "Name": "FilterZeroLambdas", - "Type": "Bool", - "Desc": "Filter zero lambdas during training", + "Name": "SparsifyThreshold", + "Type": "Float", + "Desc": "Sparsity level needed to use sparse feature representation", "Aliases": [ - "fzl" + "sp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.7 }, { - "Name": "BaselineScoresFormula", - "Type": "String", - "Desc": "Freeform defining the scores that should be used as the baseline ranker", + "Name": "FeatureFirstUsePenalty", + "Type": "Float", + "Desc": "The feature first use penalty coefficient", "Aliases": [ - "basescores" + "ffup" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": 0.0 }, { - "Name": "BaselineAlphaRisk", - "Type": "String", - "Desc": "Baseline alpha for tradeoffs of risk (0 is normal training)", + "Name": "FeatureReusePenalty", + "Type": "Float", + "Desc": "The feature re-use penalty (regularization) coefficient", "Aliases": [ - "basealpha" + "frup" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": 0.0 }, { - "Name": "PositionDiscountFreeform", - "Type": "String", - "Desc": "The 
discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", + "Name": "GainConfidenceLevel", + "Type": "Float", + "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", "Aliases": [ - "pdff" + "gainconf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": 0.0 }, { - "Name": "ParallelTrainer", - "Type": { - "Kind": "Component", - "ComponentKind": "ParallelTraining" - }, - "Desc": "Allows to choose Parallel FastTree Learning Algorithm", + "Name": "SoftmaxTemperature", + "Type": "Float", + "Desc": "The temperature of the randomized softmax distribution for choosing the feature", "Aliases": [ - "parag" + "smtemp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": { - "Name": "Single" - } + "Default": 0.0 }, { - "Name": "NumThreads", - "Type": "Int", - "Desc": "The number of threads to use", + "Name": "ExecutionTimes", + "Type": "Bool", + "Desc": "Print execution time breakdown to stdout", "Aliases": [ - "t" + "et" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": false }, { - "Name": "RngSeed", - "Type": "Int", - "Desc": "The seed of the random number generator", + "Name": "FeatureFraction", + "Type": "Float", + "Desc": "The fraction of features (chosen randomly) to use on each iteration", "Aliases": [ - "r1" + "ff" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 123 + "Default": 0.7 }, { - "Name": "FeatureSelectSeed", + "Name": "BaggingSize", "Type": "Int", - "Desc": "The seed of the active feature selection", + "Desc": "Number of trees in each bag (0 for disabling bagging)", "Aliases": [ - "r3" + "bag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 123 + "Default": 1 }, { - "Name": "EntropyCoefficient", + "Name": "BaggingTrainFraction", "Type": "Float", - 
"Desc": "The entropy (regularization) coefficient between 0 and 1", + "Desc": "Percentage of training examples used in each bag", "Aliases": [ - "e" + "bagfrac" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 0.7 }, { - "Name": "HistogramPoolSize", - "Type": "Int", - "Desc": "The number of histograms in the pool (between 2 and numLeaves)", + "Name": "SplitFraction", + "Type": "Float", + "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ - "ps" + "sf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": -1 + "Default": 0.7 }, { - "Name": "DiskTranspose", - "Type": "Bool", - "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", - "Aliases": [ - "dt" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "FeatureFlocks", - "Type": "Bool", - "Desc": "Whether to collectivize features during dataset preparation to speed up training", - "Aliases": [ - "flocks" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "CategoricalSplit", - "Type": "Bool", - "Desc": "Whether to do split based on multiple categorical feature values.", - "Aliases": [ - "cat" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "MaxCategoricalGroupsPerNode", - "Type": "Int", - "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", - "Aliases": [ - "mcg" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 64 - }, - { - "Name": "MaxCategoricalSplitPoints", - "Type": "Int", - "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", - "Aliases": [ - "maxcat" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 64 - }, - { - "Name": "MinDocsPercentageForCategoricalSplit", - "Type": "Float", - "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", - "Aliases": [ - "mdop" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.001 - }, - { - "Name": "MinDocsForCategoricalSplit", - "Type": "Int", - "Desc": "Minimum categorical doc count in a bin to consider for a split.", - "Aliases": [ - "mdo" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 100 - }, - { - "Name": "Bias", - "Type": "Float", - "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", - "Aliases": [ - "bias" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "Bundling", - "Type": { - "Kind": "Enum", - "Values": [ - "None", - "AggregateLowPopulation", - "Adjacent" - ] - }, - "Desc": "Bundle low population bins. 
Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", - "Aliases": [ - "bundle" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "None" - }, - { - "Name": "MaxBins", - "Type": "Int", - "Desc": "Maximum number of distinct values (bins) per feature", - "Aliases": [ - "mb" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 255 - }, - { - "Name": "SparsifyThreshold", - "Type": "Float", - "Desc": "Sparsity level needed to use sparse feature representation", - "Aliases": [ - "sp" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.7 - }, - { - "Name": "FeatureFirstUsePenalty", - "Type": "Float", - "Desc": "The feature first use penalty coefficient", - "Aliases": [ - "ffup" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "FeatureReusePenalty", - "Type": "Float", - "Desc": "The feature re-use penalty (regularization) coefficient", - "Aliases": [ - "frup" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "GainConfidenceLevel", - "Type": "Float", - "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", - "Aliases": [ - "gainconf" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "SoftmaxTemperature", - "Type": "Float", - "Desc": "The temperature of the randomized softmax distribution for choosing the feature", - "Aliases": [ - "smtemp" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "ExecutionTimes", - "Type": "Bool", - "Desc": "Print execution time breakdown to stdout", - "Aliases": [ - "et" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "FeatureFraction", - "Type": 
"Float", - "Desc": "The fraction of features (chosen randomly) to use on each iteration", - "Aliases": [ - "ff" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1.0 - }, - { - "Name": "BaggingSize", - "Type": "Int", - "Desc": "Number of trees in each bag (0 for disabling bagging)", - "Aliases": [ - "bag" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0 - }, - { - "Name": "BaggingTrainFraction", - "Type": "Float", - "Desc": "Percentage of training examples used in each bag", - "Aliases": [ - "bagfrac" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.7 - }, - { - "Name": "SplitFraction", - "Type": "Float", - "Desc": "The fraction of features (chosen randomly) to use on each split", - "Aliases": [ - "sf" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1.0 - }, - { - "Name": "Smoothing", - "Type": "Float", - "Desc": "Smoothing paramter for tree regularization", + "Name": "Smoothing", + "Type": "Float", + "Desc": "Smoothing paramter for tree regularization", "Aliases": [ "s" ], @@ -5878,15 +5942,15 @@ "ITrainerInput" ], "OutputKind": [ - "IBinaryClassificationOutput", + "IRegressionOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.FastTreeRanker", - "Desc": "Trains gradient boosted decision trees to the LambdaRank quasi-gradient.", - "FriendlyName": "FastTree (Boosted Trees) Ranking", - "ShortName": "ftrank", + "Name": "Trainers.FastTreeBinaryClassifier", + "Desc": "Uses a logit-boost boosted tree learner to perform binary classification.", + "FriendlyName": "FastTree (Boosted Trees) Classification", + "ShortName": "ftc", "Inputs": [ { "Name": "NumTrees", @@ -6065,92 +6129,11 @@ "Default": "Auto" }, { - "Name": "CustomGains", - "Type": "String", - "Desc": "Comma seperated list of gains associated to each relevance label.", - "Aliases": [ - "gains" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - 
"Default": "0,3,7,15,31" - }, - { - "Name": "TrainDcg", - "Type": "Bool", - "Desc": "Train DCG instead of NDCG", - "Aliases": [ - "dcg" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "SortingAlgorithm", - "Type": "String", - "Desc": "The sorting algorithm to use for DCG and LambdaMart calculations [DescendingStablePessimistic/DescendingStable/DescendingReverse/DescendingDotNet]", - "Aliases": [ - "sort" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "DescendingStablePessimistic" - }, - { - "Name": "LambdaMartMaxTruncation", - "Type": "Int", - "Desc": "max-NDCG truncation to use in the Lambda Mart algorithm", - "Aliases": [ - "n" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 100 - }, - { - "Name": "ShiftedNdcg", - "Type": "Bool", - "Desc": "Use shifted NDCG", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "CostFunctionParam", - "Type": "Char", - "Desc": "Cost function parameter (w/c)", - "Aliases": [ - "cf" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "w" - }, - { - "Name": "DistanceWeight2", - "Type": "Bool", - "Desc": "Distance weight 2 adjustment to cost", - "Aliases": [ - "dw" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "NormalizeQueryLambdas", + "Name": "UnbalancedSets", "Type": "Bool", - "Desc": "Normalize query lambdas", + "Desc": "Should we use derivatives optimized for unbalanced sets", "Aliases": [ - "nql" + "us" ], "Required": false, "SortOrder": 150.0, @@ -6249,7 +6232,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": 0 }, { "Name": "EnablePruning", @@ -6871,15 +6854,15 @@ "ITrainerInput" ], "OutputKind": [ - "IRankingOutput", + "IBinaryClassificationOutput", "ITrainerOutput" ] }, { - "Name": 
"Trainers.FastTreeRegressor", - "Desc": "Trains gradient boosted decision trees to fit target values using least-squares.", - "FriendlyName": "FastTree (Boosted Trees) Regression", - "ShortName": "ftr", + "Name": "Trainers.FastTreeRanker", + "Desc": "Trains gradient boosted decision trees to the LambdaRank quasi-gradient.", + "FriendlyName": "FastTree (Boosted Trees) Ranking", + "ShortName": "ftrank", "Inputs": [ { "Name": "NumTrees", @@ -7057,6 +7040,99 @@ "IsNullable": false, "Default": "Auto" }, + { + "Name": "CustomGains", + "Type": "String", + "Desc": "Comma seperated list of gains associated to each relevance label.", + "Aliases": [ + "gains" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "0,3,7,15,31" + }, + { + "Name": "TrainDcg", + "Type": "Bool", + "Desc": "Train DCG instead of NDCG", + "Aliases": [ + "dcg" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "SortingAlgorithm", + "Type": "String", + "Desc": "The sorting algorithm to use for DCG and LambdaMart calculations [DescendingStablePessimistic/DescendingStable/DescendingReverse/DescendingDotNet]", + "Aliases": [ + "sort" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "DescendingStablePessimistic" + }, + { + "Name": "LambdaMartMaxTruncation", + "Type": "Int", + "Desc": "max-NDCG truncation to use in the Lambda Mart algorithm", + "Aliases": [ + "n" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 100 + }, + { + "Name": "ShiftedNdcg", + "Type": "Bool", + "Desc": "Use shifted NDCG", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "CostFunctionParam", + "Type": "Char", + "Desc": "Cost function parameter (w/c)", + "Aliases": [ + "cf" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "w" + }, + { + "Name": "DistanceWeight2", + "Type": "Bool", + "Desc": 
"Distance weight 2 adjustment to cost", + "Aliases": [ + "dw" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "NormalizeQueryLambdas", + "Type": "Bool", + "Desc": "Normalize query lambdas", + "Aliases": [ + "nql" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, { "Name": "BestStepRankingRegressionTrees", "Type": "Bool", @@ -7771,15 +7847,15 @@ "ITrainerInput" ], "OutputKind": [ - "IRegressionOutput", + "IRankingOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.FastTreeTweedieRegressor", - "Desc": "Trains gradient boosted decision trees to fit target values using a Tweedie loss function. This learner is a generalization of Poisson, compound Poisson, and gamma regression.", - "FriendlyName": "FastTree (Boosted Trees) Tweedie Regression", - "ShortName": "fttweedie", + "Name": "Trainers.FastTreeRegressor", + "Desc": "Trains gradient boosted decision trees to fit target values using least-squares.", + "FriendlyName": "FastTree (Boosted Trees) Regression", + "ShortName": "ftr", "Inputs": [ { "Name": "NumTrees", @@ -7957,15 +8033,6 @@ "IsNullable": false, "Default": "Auto" }, - { - "Name": "Index", - "Type": "Float", - "Desc": "Index parameter for the Tweedie distribution, in the range [1, 2]. 1 is Poisson loss, 2 is gamma loss, and intermediate values are compound Poisson loss.", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1.5 - }, { "Name": "BestStepRankingRegressionTrees", "Type": "Bool", @@ -8058,7 +8125,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 1 }, { "Name": "EnablePruning", @@ -8685,28 +8752,28 @@ ] }, { - "Name": "Trainers.GeneralizedAdditiveModelBinaryClassifier", - "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. 
It mantains no interactions between features.", - "FriendlyName": "Generalized Additive Model for Binary Classification", - "ShortName": "gam", + "Name": "Trainers.FastTreeTweedieRegressor", + "Desc": "Trains gradient boosted decision trees to fit target values using a Tweedie loss function. This learner is a generalization of Poisson, compound Poisson, and gamma regression.", + "FriendlyName": "FastTree (Boosted Trees) Tweedie Regression", + "ShortName": "fttweedie", "Inputs": [ { - "Name": "NumIterations", + "Name": "NumTrees", "Type": "Int", - "Desc": "Total number of iterations over all features", + "Desc": "Total number of decision trees to create in the ensemble", "Aliases": [ "iter" ], "Required": false, "SortOrder": 1.0, "IsNullable": false, - "Default": 9500, + "Default": 100, "SweepRange": { "RangeType": "Discrete", "Values": [ - 200, - 1500, - 9500 + 20, + 100, + 500 ] } }, @@ -8721,6 +8788,25 @@ "SortOrder": 1.0, "IsNullable": false }, + { + "Name": "NumLeaves", + "Type": "Int", + "Desc": "The max number of leaves in each regression tree", + "Aliases": [ + "nl" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": 20, + "SweepRange": { + "RangeType": "Long", + "Min": 2, + "Max": 128, + "StepSize": 4.0, + "IsLogScale": true + } + }, { "Name": "FeatureColumn", "Type": "String", @@ -8734,11 +8820,11 @@ "Default": "Features" }, { - "Name": "MinDocuments", + "Name": "MinDocumentsInLeafs", "Type": "Int", - "Desc": "Minimum number of training instances required to form a partition", + "Desc": "The minimal number of documents allowed in a leaf of a regression tree, out of the subsampled data", "Aliases": [ - "mi" + "mil" ], "Required": false, "SortOrder": 3.0, @@ -8775,11 +8861,11 @@ "Required": false, "SortOrder": 4.0, "IsNullable": false, - "Default": 0.002, + "Default": 0.2, "SweepRange": { "RangeType": "Float", - "Min": 0.001, - "Max": 0.1, + "Min": 0.025, + "Max": 0.4, "IsLogScale": true } }, @@ -8795,6 +8881,18 @@ "IsNullable": 
false, "Default": "Weight" }, + { + "Name": "GroupIdColumn", + "Type": "String", + "Desc": "Column to use for example groupId", + "Aliases": [ + "groupId" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": "GroupId" + }, { "Name": "NormalizeFeatures", "Type": { @@ -8836,11 +8934,20 @@ "Default": "Auto" }, { - "Name": "UnbalancedSets", + "Name": "Index", + "Type": "Float", + "Desc": "Index parameter for the Tweedie distribution, in the range [1, 2]. 1 is Poisson loss, 2 is gamma loss, and intermediate values are compound Poisson loss.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.5 + }, + { + "Name": "BestStepRankingRegressionTrees", "Type": "Bool", - "Desc": "Should we use derivatives optimized for unbalanced sets", + "Desc": "Use best regression step trees?", "Aliases": [ - "us" + "bsr" ], "Required": false, "SortOrder": 150.0, @@ -8848,34 +8955,35 @@ "Default": false }, { - "Name": "Calibrator", - "Type": { - "Kind": "Component", - "ComponentKind": "CalibratorTrainer" - }, - "Desc": "The calibrator kind to apply to the predictor. 
Specify null for no calibration", + "Name": "UseLineSearch", + "Type": "Bool", + "Desc": "Should we use line search for a step size", + "Aliases": [ + "ls" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": { - "Name": "PlattCalibrator" - } + "Default": false }, { - "Name": "MaxCalibrationExamples", + "Name": "NumPostBracketSteps", "Type": "Int", - "Desc": "The maximum number of examples to use when training the calibrator", + "Desc": "Number of post-bracket line search steps", + "Aliases": [ + "lssteps" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1000000 + "Default": 0 }, { - "Name": "EntropyCoefficient", + "Name": "MinStepSize", "Type": "Float", - "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Desc": "Minimum line search step size", "Aliases": [ - "e" + "minstep" ], "Required": false, "SortOrder": 150.0, @@ -8883,293 +8991,251 @@ "Default": 0.0 }, { - "Name": "GainConfidenceLevel", - "Type": "Int", - "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", + "Name": "OptimizationAlgorithm", + "Type": { + "Kind": "Enum", + "Values": [ + "GradientDescent", + "AcceleratedGradientDescent", + "ConjugateGradientDescent" + ] + }, + "Desc": "Optimization algorithm to be used (GradientDescent, AcceleratedGradientDescent)", "Aliases": [ - "gainconf" + "oa" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": "GradientDescent" }, { - "Name": "NumThreads", - "Type": "Int", - "Desc": "The number of threads to use", + "Name": "EarlyStoppingRule", + "Type": { + "Kind": "Component", + "ComponentKind": "EarlyStoppingCriterion" + }, + "Desc": "Early stopping rule. 
(Validation set (/valid) is required.)", "Aliases": [ - "t" + "esr" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, + "IsNullable": false, "Default": null }, { - "Name": "DiskTranspose", - "Type": "Bool", - "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", + "Name": "EarlyStoppingMetrics", + "Type": "Int", + "Desc": "Early stopping metrics. (For regression, 1: L1, 2:L2; for ranking, 1:NDCG@1, 3:NDCG@3)", "Aliases": [ - "dt" + "esmt" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 0 }, { - "Name": "MaxBins", - "Type": "Int", - "Desc": "Maximum number of distinct values (bins) per feature", + "Name": "EnablePruning", + "Type": "Bool", + "Desc": "Enable post-training pruning to avoid overfitting. (a validation set is required)", "Aliases": [ - "mb" + "pruning" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 255 + "Default": false }, { - "Name": "MaxOutput", - "Type": "Float", - "Desc": "Upper bound on absolute value of single output", + "Name": "UseTolerantPruning", + "Type": "Bool", + "Desc": "Use window and tolerance for pruning", "Aliases": [ - "mo" + "prtol" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "Infinity" + "Default": false }, { - "Name": "GetDerivativesSampleRate", - "Type": "Int", - "Desc": "Sample each query 1 in k times in the GetDerivatives function", + "Name": "PruningThreshold", + "Type": "Float", + "Desc": "The tolerance threshold for pruning", "Aliases": [ - "sr" + "prth" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": 0.004 }, { - "Name": "RngSeed", + "Name": "PruningWindowSize", "Type": "Int", - "Desc": "The seed of the random number generator", + "Desc": "The moving window size for pruning", "Aliases": [ - "r1" + "prws" ], "Required": false, "SortOrder": 150.0, "IsNullable": 
false, - "Default": 123 + "Default": 5 }, { - "Name": "FeatureFlocks", - "Type": "Bool", - "Desc": "Whether to collectivize features during dataset preparation to speed up training", + "Name": "Shrinkage", + "Type": "Float", + "Desc": "Shrinkage", "Aliases": [ - "flocks" + "shrk" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true - } - ], - "Outputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The trained model" - } - ], - "InputKind": [ - "ITrainerInputWithWeight", - "ITrainerInputWithLabel", - "ITrainerInput" - ], - "OutputKind": [ - "IBinaryClassificationOutput", - "ITrainerOutput" - ] - }, - { - "Name": "Trainers.GeneralizedAdditiveModelRegressor", - "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.", - "FriendlyName": "Generalized Additive Model for Regression", - "ShortName": "gamr", - "Inputs": [ + "Default": 1.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.025, + "Max": 4.0, + "IsLogScale": true + } + }, { - "Name": "NumIterations", - "Type": "Int", - "Desc": "Total number of iterations over all features", + "Name": "DropoutRate", + "Type": "Float", + "Desc": "Dropout rate for tree regularization", "Aliases": [ - "iter" + "tdrop" ], "Required": false, - "SortOrder": 1.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 9500, + "Default": 0.0, "SweepRange": { "RangeType": "Discrete", "Values": [ - 200, - 1500, - 9500 + 0.0, + 1E-09, + 0.05, + 0.1, + 0.2 ] } }, { - "Name": "TrainingData", - "Type": "DataView", - "Desc": "The data to be used for training", + "Name": "GetDerivativesSampleRate", + "Type": "Int", + "Desc": "Sample each query 1 in k times in the GetDerivatives function", "Aliases": [ - "data" + "sr" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1 }, { - 
"Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", + "Name": "WriteLastEnsemble", + "Type": "Bool", + "Desc": "Write the last ensemble instead of the one determined by early stopping", "Aliases": [ - "feat" + "hl" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Features" + "Default": false }, { - "Name": "MinDocuments", - "Type": "Int", - "Desc": "Minimum number of training instances required to form a partition", + "Name": "MaxTreeOutput", + "Type": "Float", + "Desc": "Upper bound on absolute value of single tree output", "Aliases": [ - "mi" + "mo" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 10, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 1, - 10, - 50 - ] - } + "Default": 100.0 }, { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Column to use for labels", + "Name": "RandomStart", + "Type": "Bool", + "Desc": "Training starts from random ordering (determined by /r1)", "Aliases": [ - "lab" + "rs" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Label" + "Default": false }, { - "Name": "LearningRates", - "Type": "Float", - "Desc": "The learning rate", + "Name": "FilterZeroLambdas", + "Type": "Bool", + "Desc": "Filter zero lambdas during training", "Aliases": [ - "lr" + "fzl" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 0.002, - "SweepRange": { - "RangeType": "Float", - "Min": 0.001, - "Max": 0.1, - "IsLogScale": true - } + "Default": false }, { - "Name": "WeightColumn", + "Name": "BaselineScoresFormula", "Type": "String", - "Desc": "Column to use for example weight", - "Aliases": [ - "weight" - ], - "Required": false, - "SortOrder": 4.0, - "IsNullable": false, - "Default": "Weight" - }, - { - "Name": "NormalizeFeatures", - "Type": { - "Kind": "Enum", - "Values": [ - "No", - "Warn", - "Auto", - "Yes" - ] 
- }, - "Desc": "Normalize option for the feature column", + "Desc": "Freeform defining the scores that should be used as the baseline ranker", "Aliases": [ - "norm" + "basescores" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": null }, { - "Name": "Caching", - "Type": { - "Kind": "Enum", - "Values": [ - "Auto", - "Memory", - "Disk", - "None" - ] - }, - "Desc": "Whether learner should cache input training data", + "Name": "BaselineAlphaRisk", + "Type": "String", + "Desc": "Baseline alpha for tradeoffs of risk (0 is normal training)", "Aliases": [ - "cache" + "basealpha" ], "Required": false, - "SortOrder": 6.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": null }, { - "Name": "EntropyCoefficient", - "Type": "Float", - "Desc": "The entropy (regularization) coefficient between 0 and 1", + "Name": "PositionDiscountFreeform", + "Type": "String", + "Desc": "The discount freeform which specifies the per position discounts of documents in a query (uses a single variable P for position where P=0 is first position)", "Aliases": [ - "e" + "pdff" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": null }, { - "Name": "GainConfidenceLevel", - "Type": "Int", - "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", + "Name": "ParallelTrainer", + "Type": { + "Kind": "Component", + "ComponentKind": "ParallelTraining" + }, + "Desc": "Allows to choose Parallel FastTree Learning Algorithm", "Aliases": [ - "gainconf" + "parag" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": { + "Name": "Single" + } }, { "Name": "NumThreads", @@ -9184,64 +9250,64 @@ "Default": null }, { - "Name": "DiskTranspose", - "Type": "Bool", - "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", + "Name": "RngSeed", + 
"Type": "Int", + "Desc": "The seed of the random number generator", "Aliases": [ - "dt" + "r1" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 123 }, { - "Name": "MaxBins", + "Name": "FeatureSelectSeed", "Type": "Int", - "Desc": "Maximum number of distinct values (bins) per feature", + "Desc": "The seed of the active feature selection", "Aliases": [ - "mb" + "r3" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 255 + "Default": 123 }, { - "Name": "MaxOutput", + "Name": "EntropyCoefficient", "Type": "Float", - "Desc": "Upper bound on absolute value of single output", + "Desc": "The entropy (regularization) coefficient between 0 and 1", "Aliases": [ - "mo" + "e" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "Infinity" + "Default": 0.0 }, { - "Name": "GetDerivativesSampleRate", + "Name": "HistogramPoolSize", "Type": "Int", - "Desc": "Sample each query 1 in k times in the GetDerivatives function", + "Desc": "The number of histograms in the pool (between 2 and numLeaves)", "Aliases": [ - "sr" + "ps" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": -1 }, { - "Name": "RngSeed", - "Type": "Int", - "Desc": "The seed of the random number generator", + "Name": "DiskTranspose", + "Type": "Bool", + "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", "Aliases": [ - "r1" + "dt" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": 123 + "IsNullable": true, + "Default": null }, { "Name": "FeatureFlocks", @@ -9254,459 +9320,326 @@ "SortOrder": 150.0, "IsNullable": false, "Default": true - } - ], - "Outputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The trained model" - } - ], - "InputKind": [ - "ITrainerInputWithWeight", - "ITrainerInputWithLabel", - "ITrainerInput" - ], - 
"OutputKind": [ - "IRegressionOutput", - "ITrainerOutput" - ] - }, - { - "Name": "Trainers.KMeansPlusPlusClusterer", - "Desc": "K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified number of clusters in order to minimize the within-cluster sum of squares. K-means++ improves upon K-means by using a better method for choosing the initial cluster centers.", - "FriendlyName": "KMeans++ Clustering", - "ShortName": "KM", - "Inputs": [ - { - "Name": "TrainingData", - "Type": "DataView", - "Desc": "The data to be used for training", - "Aliases": [ - "data" - ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false }, { - "Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", + "Name": "CategoricalSplit", + "Type": "Bool", + "Desc": "Whether to do split based on multiple categorical feature values.", "Aliases": [ - "feat" + "cat" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Features" + "Default": false }, { - "Name": "WeightColumn", - "Type": "String", - "Desc": "Column to use for example weight", + "Name": "MaxCategoricalGroupsPerNode", + "Type": "Int", + "Desc": "Maximum categorical split groups to consider when splitting on a categorical feature. Split groups are a collection of split points. 
This is used to reduce overfitting when there many categorical features.", "Aliases": [ - "weight" + "mcg" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Weight" + "Default": 64 }, { - "Name": "NormalizeFeatures", - "Type": { - "Kind": "Enum", - "Values": [ - "No", - "Warn", - "Auto", - "Yes" - ] - }, - "Desc": "Normalize option for the feature column", + "Name": "MaxCategoricalSplitPoints", + "Type": "Int", + "Desc": "Maximum categorical split points to consider when splitting on a categorical feature.", "Aliases": [ - "norm" + "maxcat" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": 64 }, { - "Name": "Caching", - "Type": { - "Kind": "Enum", - "Values": [ - "Auto", - "Memory", - "Disk", - "None" - ] - }, - "Desc": "Whether learner should cache input training data", + "Name": "MinDocsPercentageForCategoricalSplit", + "Type": "Float", + "Desc": "Minimum categorical docs percentage in a bin to consider for a split.", "Aliases": [ - "cache" + "mdop" ], "Required": false, - "SortOrder": 6.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": 0.001 }, { - "Name": "K", + "Name": "MinDocsForCategoricalSplit", "Type": "Int", - "Desc": "The number of clusters", - "Required": false, - "SortOrder": 50.0, + "Desc": "Minimum categorical doc count in a bin to consider for a split.", + "Aliases": [ + "mdo" + ], + "Required": false, + "SortOrder": 150.0, "IsNullable": false, - "Default": 5, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 5, - 10, - 20, - 40 - ] - } + "Default": 100 }, { - "Name": "NumThreads", - "Type": "Int", - "Desc": "Degree of lock-free parallelism. Defaults to automatic. 
Determinism not guaranteed.", + "Name": "Bias", + "Type": "Float", + "Desc": "Bias for calculating gradient for each feature bin for a categorical feature.", "Aliases": [ - "nt", - "t", - "threads" + "bias" ], "Required": false, - "SortOrder": 50.0, - "IsNullable": true, - "Default": null + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 }, { - "Name": "InitAlgorithm", + "Name": "Bundling", "Type": { "Kind": "Enum", "Values": [ - "KMeansPlusPlus", - "Random", - "KMeansParallel" + "None", + "AggregateLowPopulation", + "Adjacent" ] }, - "Desc": "Cluster initialization algorithm", + "Desc": "Bundle low population bins. Bundle.None(0): no bundling, Bundle.AggregateLowPopulation(1): Bundle low population, Bundle.Adjacent(2): Neighbor low population bundle.", "Aliases": [ - "init" + "bundle" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "KMeansParallel" + "Default": "None" }, { - "Name": "OptTol", - "Type": "Float", - "Desc": "Tolerance parameter for trainer convergence. 
Lower = slower, more accurate", + "Name": "MaxBins", + "Type": "Int", + "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ - "ot" + "mb" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1E-07 + "Default": 255 }, { - "Name": "MaxIterations", - "Type": "Int", - "Desc": "Maximum number of iterations.", + "Name": "SparsifyThreshold", + "Type": "Float", + "Desc": "Sparsity level needed to use sparse feature representation", "Aliases": [ - "maxiter" + "sp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1000 + "Default": 0.7 }, { - "Name": "AccelMemBudgetMb", - "Type": "Int", - "Desc": "Memory budget (in MBs) to use for KMeans acceleration", + "Name": "FeatureFirstUsePenalty", + "Type": "Float", + "Desc": "The feature first use penalty coefficient", "Aliases": [ - "accelMemBudgetMb" + "ffup" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 4096 - } - ], - "Outputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The trained model" - } - ], - "InputKind": [ - "IUnsupervisedTrainerWithWeight", - "ITrainerInput" - ], - "OutputKind": [ - "IClusteringOutput", - "ITrainerOutput" - ] - }, - { - "Name": "Trainers.LinearSvmBinaryClassifier", - "Desc": "Train a linear SVM.", - "FriendlyName": "SVM (Pegasos-Linear)", - "ShortName": "svm", - "Inputs": [ - { - "Name": "TrainingData", - "Type": "DataView", - "Desc": "The data to be used for training", - "Aliases": [ - "data" - ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Default": 0.0 }, { - "Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", + "Name": "FeatureReusePenalty", + "Type": "Float", + "Desc": "The feature re-use penalty (regularization) coefficient", "Aliases": [ - "feat" + "frup" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Features" + "Default": 0.0 }, { - "Name": "LabelColumn", - 
"Type": "String", - "Desc": "Column to use for labels", + "Name": "GainConfidenceLevel", + "Type": "Float", + "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", "Aliases": [ - "lab" + "gainconf" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Label" + "Default": 0.0 }, { - "Name": "NormalizeFeatures", - "Type": { - "Kind": "Enum", - "Values": [ - "No", - "Warn", - "Auto", - "Yes" - ] - }, - "Desc": "Normalize option for the feature column", + "Name": "SoftmaxTemperature", + "Type": "Float", + "Desc": "The temperature of the randomized softmax distribution for choosing the feature", "Aliases": [ - "norm" + "smtemp" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": 0.0 }, { - "Name": "Caching", - "Type": { - "Kind": "Enum", - "Values": [ - "Auto", - "Memory", - "Disk", - "None" - ] - }, - "Desc": "Whether learner should cache input training data", + "Name": "ExecutionTimes", + "Type": "Bool", + "Desc": "Print execution time breakdown to stdout", "Aliases": [ - "cache" + "et" ], "Required": false, - "SortOrder": 6.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Auto" + "Default": false }, { - "Name": "Lambda", + "Name": "FeatureFraction", "Type": "Float", - "Desc": "Regularizer constant", + "Desc": "The fraction of features (chosen randomly) to use on each iteration", "Aliases": [ - "lambda" + "ff" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 0.001, - "SweepRange": { - "RangeType": "Float", - "Min": 1E-05, - "Max": 0.1, - "StepSize": 10.0, - "IsLogScale": true - } + "Default": 1.0 }, { - "Name": "PerformProjection", - "Type": "Bool", - "Desc": "Perform projection to unit-ball? 
Typically used with batch size > 1.", + "Name": "BaggingSize", + "Type": "Int", + "Desc": "Number of trees in each bag (0 for disabling bagging)", "Aliases": [ - "project" + "bag" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": false, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } + "Default": 0 }, { - "Name": "NumIterations", - "Type": "Int", - "Desc": "Number of iterations", + "Name": "BaggingTrainFraction", + "Type": "Float", + "Desc": "Percentage of training examples used in each bag", "Aliases": [ - "iter" + "bagfrac" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 1, - "SweepRange": { - "RangeType": "Long", - "Min": 1, - "Max": 100, - "StepSize": 10.0, - "IsLogScale": true - } + "Default": 0.7 }, { - "Name": "InitWtsDiameter", + "Name": "SplitFraction", "Type": "Float", - "Desc": "Init weights diameter", + "Desc": "The fraction of features (chosen randomly) to use on each split", "Aliases": [ - "initwts" + "sf" ], "Required": false, - "SortOrder": 140.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 5 - } + "Default": 1.0 }, { - "Name": "NoBias", - "Type": "Bool", - "Desc": "No bias", + "Name": "Smoothing", + "Type": "Float", + "Desc": "Smoothing paramter for tree regularization", + "Aliases": [ + "s" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } + "Default": 0.0 }, { - "Name": "Calibrator", - "Type": { - "Kind": "Component", - "ComponentKind": "CalibratorTrainer" - }, - "Desc": "The calibrator kind to apply to the predictor. 
Specify null for no calibration", + "Name": "AllowEmptyTrees", + "Type": "Bool", + "Desc": "When a root split is impossible, allow training to proceed", + "Aliases": [ + "allowempty", + "dummies" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": { - "Name": "PlattCalibrator" - } + "Default": true }, { - "Name": "MaxCalibrationExamples", + "Name": "FeatureCompressionLevel", "Type": "Int", - "Desc": "The maximum number of examples to use when training the calibrator", + "Desc": "The level of feature compression to use", + "Aliases": [ + "fcomp" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1000000 + "Default": 1 }, { - "Name": "InitialWeights", - "Type": "String", - "Desc": "Initial Weights and bias, comma-separated", + "Name": "CompressEnsemble", + "Type": "Bool", + "Desc": "Compress the tree Ensemble", "Aliases": [ - "initweights" + "cmp" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": false }, { - "Name": "Shuffle", + "Name": "MaxTreesAfterCompression", + "Type": "Int", + "Desc": "Maximum Number of trees after compression", + "Aliases": [ + "cmpmax" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": -1 + }, + { + "Name": "PrintTestGraph", "Type": "Bool", - "Desc": "Whether to shuffle for each training iteration", + "Desc": "Print metrics graph for the first test set", "Aliases": [ - "shuf" + "graph" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } + "Default": false }, { - "Name": "StreamingCacheSize", - "Type": "Int", - "Desc": "Size of cache when trained in Scope", + "Name": "PrintTrainValidGraph", + "Type": "Bool", + "Desc": "Print Train and Validation metrics in graph", "Aliases": [ - "cache" + "graphtv" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1000000 + "Default": false }, { 
- "Name": "BatchSize", + "Name": "TestFrequency", "Type": "Int", - "Desc": "Batch size", + "Desc": "Calculate metric values for train/valid/test every k rounds", "Aliases": [ - "batch" + "tf" ], "Required": false, - "SortOrder": 190.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 1 + "Default": 2147483647 } ], "Outputs": [ @@ -9717,20 +9650,42 @@ } ], "InputKind": [ + "ITrainerInputWithGroupId", + "ITrainerInputWithWeight", "ITrainerInputWithLabel", "ITrainerInput" ], "OutputKind": [ - "IBinaryClassificationOutput", + "IRegressionOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.LogisticRegressionBinaryClassifier", - "Desc": "Train a logistic regression binary model", - "FriendlyName": "Logistic Regression", - "ShortName": "lr", + "Name": "Trainers.GeneralizedAdditiveModelBinaryClassifier", + "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.", + "FriendlyName": "Generalized Additive Model for Binary Classification", + "ShortName": "gam", "Inputs": [ + { + "Name": "NumIterations", + "Type": "Int", + "Desc": "Total number of iterations over all features", + "Aliases": [ + "iter" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 9500, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 200, + 1500, + 9500 + ] + } + }, { "Name": "TrainingData", "Type": "DataView", @@ -9754,6 +9709,26 @@ "IsNullable": false, "Default": "Features" }, + { + "Name": "MinDocuments", + "Type": "Int", + "Desc": "Minimum number of training instances required to form a partition", + "Aliases": [ + "mi" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": 10, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 10, + 50 + ] + } + }, { "Name": "LabelColumn", "Type": "String", @@ -9766,6 +9741,24 @@ "IsNullable": false, "Default": "Label" }, + { + "Name": "LearningRates", + 
"Type": "Float", + "Desc": "The learning rate", + "Aliases": [ + "lr" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": 0.002, + "SweepRange": { + "RangeType": "Float", + "Min": 0.001, + "Max": 0.1, + "IsLogScale": true + } + }, { "Name": "WeightColumn", "Type": "String", @@ -9819,205 +9812,147 @@ "Default": "Auto" }, { - "Name": "ShowTrainingStats", + "Name": "UnbalancedSets", "Type": "Bool", - "Desc": "Show statistics of training examples.", + "Desc": "Should we use derivatives optimized for unbalanced sets", "Aliases": [ - "stat" + "us" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, "Default": false }, { - "Name": "L2Weight", - "Type": "Float", - "Desc": "L2 regularization weight", - "Aliases": [ - "l2" - ], + "Name": "Calibrator", + "Type": { + "Kind": "Component", + "ComponentKind": "CalibratorTrainer" + }, + "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 4 + "Default": { + "Name": "PlattCalibrator" } }, { - "Name": "L1Weight", - "Type": "Float", - "Desc": "L1 regularization weight", - "Aliases": [ - "l1" - ], + "Name": "MaxCalibrationExamples", + "Type": "Int", + "Desc": "The maximum number of examples to use when training the calibrator", "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 4 - } + "Default": 1000000 }, { - "Name": "OptTol", + "Name": "EntropyCoefficient", "Type": "Float", - "Desc": "Tolerance parameter for optimization convergence. 
Lower = slower, more accurate", + "Desc": "The entropy (regularization) coefficient between 0 and 1", "Aliases": [ - "ot" + "e" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 1E-07, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.0001, - 1E-07 - ] - } + "Default": 0.0 }, { - "Name": "MemorySize", + "Name": "GainConfidenceLevel", "Type": "Int", - "Desc": "Memory size for L-BFGS. Lower=faster, less accurate", + "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", "Aliases": [ - "m" + "gainconf" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 20, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 5, - 20, - 50 - ] - } + "Default": 0 }, { - "Name": "EnforceNonNegativity", - "Type": "Bool", - "Desc": "Enforce non-negative weights", + "Name": "NumThreads", + "Type": "Int", + "Desc": "The number of threads to use", "Aliases": [ - "nn" + "t" ], "Required": false, - "SortOrder": 90.0, - "IsNullable": false, - "Default": false + "SortOrder": 150.0, + "IsNullable": true, + "Default": null }, { - "Name": "InitWtsDiameter", - "Type": "Float", - "Desc": "Init weights diameter", + "Name": "DiskTranspose", + "Type": "Bool", + "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", "Aliases": [ - "initwts" + "dt" ], "Required": false, - "SortOrder": 140.0, - "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 5 - } + "SortOrder": 150.0, + "IsNullable": true, + "Default": null }, { - "Name": "MaxIterations", + "Name": "MaxBins", "Type": "Int", - "Desc": "Maximum iterations.", + "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ - "maxiter" + "mb" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 2147483647, - "SweepRange": { - 
"RangeType": "Long", - "Min": 1, - "Max": 2147483647 - } + "Default": 255 }, { - "Name": "SgdInitializationTolerance", + "Name": "MaxOutput", "Type": "Float", - "Desc": "Run SGD to initialize LR weights, converging to this tolerance", - "Aliases": [ - "sgd" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "Quiet", - "Type": "Bool", - "Desc": "If set to true, produce no output during training.", + "Desc": "Upper bound on absolute value of single output", "Aliases": [ - "q" + "mo" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": "Infinity" }, { - "Name": "UseThreads", - "Type": "Bool", - "Desc": "Whether or not to use threads. Default is true", + "Name": "GetDerivativesSampleRate", + "Type": "Int", + "Desc": "Sample each query 1 in k times in the GetDerivatives function", "Aliases": [ - "t" + "sr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": 1 }, { - "Name": "NumThreads", + "Name": "RngSeed", "Type": "Int", - "Desc": "Number of threads", + "Desc": "The seed of the random number generator", "Aliases": [ - "nt" + "r1" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 123 }, { - "Name": "DenseOptimizer", + "Name": "FeatureFlocks", "Type": "Bool", - "Desc": "Force densification of the internal optimization vectors", + "Desc": "Whether to collectivize features during dataset preparation to speed up training", "Aliases": [ - "do" + "flocks" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } + "Default": true } ], "Outputs": [ @@ -10038,11 +9973,31 @@ ] }, { - "Name": "Trainers.LogisticRegressionClassifier", - "Desc": "Train a logistic regression multi class model", - "FriendlyName": "Multi-class Logistic Regression", - "ShortName": "mlr", + 
"Name": "Trainers.GeneralizedAdditiveModelRegressor", + "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.", + "FriendlyName": "Generalized Additive Model for Regression", + "ShortName": "gamr", "Inputs": [ + { + "Name": "NumIterations", + "Type": "Int", + "Desc": "Total number of iterations over all features", + "Aliases": [ + "iter" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 9500, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 200, + 1500, + 9500 + ] + } + }, { "Name": "TrainingData", "Type": "DataView", @@ -10066,6 +10021,26 @@ "IsNullable": false, "Default": "Features" }, + { + "Name": "MinDocuments", + "Type": "Int", + "Desc": "Minimum number of training instances required to form a partition", + "Aliases": [ + "mi" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": 10, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 10, + 50 + ] + } + }, { "Name": "LabelColumn", "Type": "String", @@ -10078,6 +10053,24 @@ "IsNullable": false, "Default": "Label" }, + { + "Name": "LearningRates", + "Type": "Float", + "Desc": "The learning rate", + "Aliases": [ + "lr" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": 0.002, + "SweepRange": { + "RangeType": "Float", + "Min": 0.001, + "Max": 0.1, + "IsLogScale": true + } + }, { "Name": "WeightColumn", "Type": "String", @@ -10131,205 +10124,112 @@ "Default": "Auto" }, { - "Name": "ShowTrainingStats", - "Type": "Bool", - "Desc": "Show statistics of training examples.", + "Name": "EntropyCoefficient", + "Type": "Float", + "Desc": "The entropy (regularization) coefficient between 0 and 1", "Aliases": [ - "stat" + "e" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 0.0 }, { - "Name": "L2Weight", - "Type": "Float", - 
"Desc": "L2 regularization weight", + "Name": "GainConfidenceLevel", + "Type": "Int", + "Desc": "Tree fitting gain confidence requirement (should be in the range [0,1) ).", "Aliases": [ - "l2" + "gainconf" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 4 - } + "Default": 0 }, { - "Name": "L1Weight", - "Type": "Float", - "Desc": "L1 regularization weight", + "Name": "NumThreads", + "Type": "Int", + "Desc": "The number of threads to use", "Aliases": [ - "l1" + "t" ], "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 1.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 4 - } + "SortOrder": 150.0, + "IsNullable": true, + "Default": null }, { - "Name": "OptTol", - "Type": "Float", - "Desc": "Tolerance parameter for optimization convergence. Lower = slower, more accurate", + "Name": "DiskTranspose", + "Type": "Bool", + "Desc": "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", "Aliases": [ - "ot" + "dt" ], "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 1E-07, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.0001, - 1E-07 - ] - } + "SortOrder": 150.0, + "IsNullable": true, + "Default": null }, { - "Name": "MemorySize", + "Name": "MaxBins", "Type": "Int", - "Desc": "Memory size for L-BFGS. 
Lower=faster, less accurate", + "Desc": "Maximum number of distinct values (bins) per feature", "Aliases": [ - "m" + "mb" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 20, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 5, - 20, - 50 - ] - } + "Default": 255 }, { - "Name": "EnforceNonNegativity", - "Type": "Bool", - "Desc": "Enforce non-negative weights", + "Name": "MaxOutput", + "Type": "Float", + "Desc": "Upper bound on absolute value of single output", "Aliases": [ - "nn" + "mo" ], "Required": false, - "SortOrder": 90.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": false - }, - { - "Name": "InitWtsDiameter", - "Type": "Float", - "Desc": "Init weights diameter", - "Aliases": [ - "initwts" - ], - "Required": false, - "SortOrder": 140.0, - "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 1.0, - "NumSteps": 5 - } + "Default": "Infinity" }, { - "Name": "MaxIterations", + "Name": "GetDerivativesSampleRate", "Type": "Int", - "Desc": "Maximum iterations.", - "Aliases": [ - "maxiter" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 2147483647, - "SweepRange": { - "RangeType": "Long", - "Min": 1, - "Max": 2147483647 - } - }, - { - "Name": "SgdInitializationTolerance", - "Type": "Float", - "Desc": "Run SGD to initialize LR weights, converging to this tolerance", + "Desc": "Sample each query 1 in k times in the GetDerivatives function", "Aliases": [ - "sgd" + "sr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0 + "Default": 1 }, { - "Name": "Quiet", - "Type": "Bool", - "Desc": "If set to true, produce no output during training.", + "Name": "RngSeed", + "Type": "Int", + "Desc": "The seed of the random number generator", "Aliases": [ - "q" + "r1" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 123 }, { - "Name": "UseThreads", + 
"Name": "FeatureFlocks", "Type": "Bool", - "Desc": "Whether or not to use threads. Default is true", + "Desc": "Whether to collectivize features during dataset preparation to speed up training", "Aliases": [ - "t" + "flocks" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, "Default": true - }, - { - "Name": "NumThreads", - "Type": "Int", - "Desc": "Number of threads", - "Aliases": [ - "nt" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "DenseOptimizer", - "Type": "Bool", - "Desc": "Force densification of the internal optimization vectors", - "Aliases": [ - "do" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } } ], "Outputs": [ @@ -10345,15 +10245,15 @@ "ITrainerInput" ], "OutputKind": [ - "IMulticlassClassificationOutput", + "IRegressionOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.NaiveBayesClassifier", - "Desc": "Train a MultiClassNaiveBayesTrainer.", - "FriendlyName": "Multiclass Naive Bayes", - "ShortName": "MNB", + "Name": "Trainers.KMeansPlusPlusClusterer", + "Desc": "K-means is a popular clustering algorithm. With K-means, the data is clustered into a specified number of clusters in order to minimize the within-cluster sum of squares. 
K-means++ improves upon K-means by using a better method for choosing the initial cluster centers.", + "FriendlyName": "KMeans++ Clustering", + "ShortName": "KM", "Inputs": [ { "Name": "TrainingData", @@ -10379,16 +10279,16 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "WeightColumn", "Type": "String", - "Desc": "Column to use for labels", + "Desc": "Column to use for example weight", "Aliases": [ - "lab" + "weight" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": "Label" + "Default": "Weight" }, { "Name": "NormalizeFeatures", @@ -10429,6 +10329,93 @@ "SortOrder": 6.0, "IsNullable": false, "Default": "Auto" + }, + { + "Name": "K", + "Type": "Int", + "Desc": "The number of clusters", + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 5, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 5, + 10, + 20, + 40 + ] + } + }, + { + "Name": "NumThreads", + "Type": "Int", + "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.", + "Aliases": [ + "nt", + "t", + "threads" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "InitAlgorithm", + "Type": { + "Kind": "Enum", + "Values": [ + "KMeansPlusPlus", + "Random", + "KMeansParallel" + ] + }, + "Desc": "Cluster initialization algorithm", + "Aliases": [ + "init" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "KMeansParallel" + }, + { + "Name": "OptTol", + "Type": "Float", + "Desc": "Tolerance parameter for trainer convergence. 
Lower = slower, more accurate", + "Aliases": [ + "ot" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1E-07 + }, + { + "Name": "MaxIterations", + "Type": "Int", + "Desc": "Maximum number of iterations.", + "Aliases": [ + "maxiter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000 + }, + { + "Name": "AccelMemBudgetMb", + "Type": "Int", + "Desc": "Memory budget (in MBs) to use for KMeans acceleration", + "Aliases": [ + "accelMemBudgetMb" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 4096 } ], "Outputs": [ @@ -10439,19 +10426,19 @@ } ], "InputKind": [ - "ITrainerInputWithLabel", + "IUnsupervisedTrainerWithWeight", "ITrainerInput" ], "OutputKind": [ - "IMulticlassClassificationOutput", + "IClusteringOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.OnlineGradientDescentRegressor", - "Desc": "Train a Online gradient descent perceptron.", - "FriendlyName": "Stochastic Gradient Descent (Regression)", - "ShortName": "ogd", + "Name": "Trainers.LinearSvmBinaryClassifier", + "Desc": "Train a linear SVM.", + "FriendlyName": "SVM (Pegasos-Linear)", + "ShortName": "svm", "Inputs": [ { "Name": "TrainingData", @@ -10529,54 +10516,35 @@ "Default": "Auto" }, { - "Name": "LossFunction", - "Type": { - "Kind": "Component", - "ComponentKind": "RegressionLossFunction" - }, - "Desc": "Loss Function", - "Aliases": [ - "loss" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": { - "Name": "SquaredLoss" - } - }, - { - "Name": "LearningRate", + "Name": "Lambda", "Type": "Float", - "Desc": "Learning rate", + "Desc": "Regularizer constant", "Aliases": [ - "lr" + "lambda" ], "Required": false, "SortOrder": 50.0, "IsNullable": false, - "Default": 0.1, + "Default": 0.001, "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.01, - 0.1, - 0.5, - 1.0 - ] + "RangeType": "Float", + "Min": 1E-05, + "Max": 0.1, + "StepSize": 10.0, + "IsLogScale": true 
} }, { - "Name": "DecreaseLearningRate", + "Name": "PerformProjection", "Type": "Bool", - "Desc": "Decrease learning rate", + "Desc": "Perform projection to unit-ball? Typically used with batch size > 1.", "Aliases": [ - "decreaselr" + "project" ], "Required": false, "SortOrder": 50.0, "IsNullable": false, - "Default": true, + "Default": false, "SweepRange": { "RangeType": "Discrete", "Values": [ @@ -10585,23 +10553,6 @@ ] } }, - { - "Name": "L2RegularizerWeight", - "Type": "Float", - "Desc": "L2 Regularization Weight", - "Aliases": [ - "reg" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Float", - "Min": 0.0, - "Max": 0.5 - } - }, { "Name": "NumIterations", "Type": "Int", @@ -10640,76 +10591,43 @@ } }, { - "Name": "ResetWeightsAfterXExamples", - "Type": "Int", - "Desc": "Number of examples after which weights will be reset to the current average", - "Aliases": [ - "numreset" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "DoLazyUpdates", - "Type": "Bool", - "Desc": "Instead of updating averaged weights on every example, only update when loss is nonzero", - "Aliases": [ - "lazy" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "RecencyGain", - "Type": "Float", - "Desc": "Extra weight given to more recent updates", - "Aliases": [ - "rg" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0 - }, - { - "Name": "RecencyGainMulti", + "Name": "NoBias", "Type": "Bool", - "Desc": "Whether Recency Gain is multiplicative (vs. 
additive)", - "Aliases": [ - "rgm" - ], + "Desc": "No bias", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": false, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } }, { - "Name": "Averaged", - "Type": "Bool", - "Desc": "Do averaging?", - "Aliases": [ - "avg" - ], + "Name": "Calibrator", + "Type": { + "Kind": "Component", + "ComponentKind": "CalibratorTrainer" + }, + "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": { + "Name": "PlattCalibrator" + } }, { - "Name": "AveragedTolerance", - "Type": "Float", - "Desc": "The inexactness tolerance for averaging", - "Aliases": [ - "avgtol" - ], + "Name": "MaxCalibrationExamples", + "Type": "Int", + "Desc": "The maximum number of examples to use when training the calibrator", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.01 + "Default": 1000000 }, { "Name": "InitialWeights", @@ -10753,6 +10671,18 @@ "SortOrder": 150.0, "IsNullable": false, "Default": 1000000 + }, + { + "Name": "BatchSize", + "Type": "Int", + "Desc": "Batch size", + "Aliases": [ + "batch" + ], + "Required": false, + "SortOrder": 190.0, + "IsNullable": false, + "Default": 1 } ], "Outputs": [ @@ -10767,15 +10697,15 @@ "ITrainerInput" ], "OutputKind": [ - "IRegressionOutput", + "IBinaryClassificationOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.PcaAnomalyDetector", - "Desc": "Train an PCA Anomaly model.", - "FriendlyName": "PCA Anomaly Detector", - "ShortName": "pcaAnom", + "Name": "Trainers.LogisticRegressionBinaryClassifier", + "Desc": "Train a logistic regression binary model", + "FriendlyName": "Logistic Regression", + "ShortName": "lr", "Inputs": [ { "Name": "TrainingData", @@ -10800,6 +10730,18 @@ "IsNullable": false, "Default": "Features" }, + { + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use 
for labels", + "Aliases": [ + "lab" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Label" + }, { "Name": "WeightColumn", "Type": "String", @@ -10853,30 +10795,79 @@ "Default": "Auto" }, { - "Name": "Rank", - "Type": "Int", - "Desc": "The number of components in the PCA", + "Name": "ShowTrainingStats", + "Type": "Bool", + "Desc": "Show statistics of training examples.", "Aliases": [ - "k" + "stat" ], "Required": false, "SortOrder": 50.0, "IsNullable": false, - "Default": 20, + "Default": false + }, + { + "Name": "L2Weight", + "Type": "Float", + "Desc": "L2 regularization weight", + "Aliases": [ + "l2" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 1.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.0, + "Max": 1.0, + "NumSteps": 4 + } + }, + { + "Name": "L1Weight", + "Type": "Float", + "Desc": "L1 regularization weight", + "Aliases": [ + "l1" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 1.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.0, + "Max": 1.0, + "NumSteps": 4 + } + }, + { + "Name": "OptTol", + "Type": "Float", + "Desc": "Tolerance parameter for optimization convergence. Lower = slower, more accurate", + "Aliases": [ + "ot" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 1E-07, "SweepRange": { "RangeType": "Discrete", "Values": [ - 10, - 20, - 40, - 80 + 0.0001, + 1E-07 ] } }, { - "Name": "Oversampling", + "Name": "MemorySize", "Type": "Int", - "Desc": "Oversampling parameter for randomized PCA training", + "Desc": "Memory size for L-BFGS. 
Lower=faster, less accurate", + "Aliases": [ + "m" + ], "Required": false, "SortOrder": 50.0, "IsNullable": false, @@ -10884,65 +10875,149 @@ "SweepRange": { "RangeType": "Discrete", "Values": [ - 10, + 5, 20, - 40 + 50 ] } }, { - "Name": "Center", + "Name": "EnforceNonNegativity", "Type": "Bool", - "Desc": "If enabled, data is centered to be zero mean", + "Desc": "Enforce non-negative weights", "Aliases": [ - "center" + "nn" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 90.0, "IsNullable": false, - "Default": true, + "Default": false + }, + { + "Name": "InitWtsDiameter", + "Type": "Float", + "Desc": "Init weights diameter", + "Aliases": [ + "initwts" + ], + "Required": false, + "SortOrder": 140.0, + "IsNullable": false, + "Default": 0.0, "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] + "RangeType": "Float", + "Min": 0.0, + "Max": 1.0, + "NumSteps": 5 } }, { - "Name": "Seed", + "Name": "MaxIterations", "Type": "Int", - "Desc": "The seed for random number generation", + "Desc": "Maximum iterations.", "Aliases": [ - "seed" + "maxiter" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null - } - ], - "Outputs": [ + "IsNullable": false, + "Default": 2147483647, + "SweepRange": { + "RangeType": "Long", + "Min": 1, + "Max": 2147483647 + } + }, { - "Name": "PredictorModel", + "Name": "SgdInitializationTolerance", + "Type": "Float", + "Desc": "Run SGD to initialize LR weights, converging to this tolerance", + "Aliases": [ + "sgd" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 + }, + { + "Name": "Quiet", + "Type": "Bool", + "Desc": "If set to true, produce no output during training.", + "Aliases": [ + "q" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "UseThreads", + "Type": "Bool", + "Desc": "Whether or not to use threads. 
Default is true", + "Aliases": [ + "t" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "NumThreads", + "Type": "Int", + "Desc": "Number of threads", + "Aliases": [ + "nt" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "DenseOptimizer", + "Type": "Bool", + "Desc": "Force densification of the internal optimization vectors", + "Aliases": [ + "do" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } + } + ], + "Outputs": [ + { + "Name": "PredictorModel", "Type": "PredictorModel", "Desc": "The trained model" } ], "InputKind": [ - "IUnsupervisedTrainerWithWeight", + "ITrainerInputWithWeight", + "ITrainerInputWithLabel", "ITrainerInput" ], "OutputKind": [ - "IAnomalyDetectionOutput", + "IBinaryClassificationOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.PoissonRegressor", - "Desc": "Train an Poisson regression model.", - "FriendlyName": "Poisson Regression", - "ShortName": "PR", + "Name": "Trainers.LogisticRegressionClassifier", + "Desc": "Train a logistic regression multi class model", + "FriendlyName": "Multi-class Logistic Regression", + "ShortName": "mlr", "Inputs": [ { "Name": "TrainingData", @@ -11031,6 +11106,18 @@ "IsNullable": false, "Default": "Auto" }, + { + "Name": "ShowTrainingStats", + "Type": "Bool", + "Desc": "Show statistics of training examples.", + "Aliases": [ + "stat" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": false + }, { "Name": "L2Weight", "Type": "Float", @@ -11234,40 +11321,16 @@ "ITrainerInput" ], "OutputKind": [ - "IRegressionOutput", + "IMulticlassClassificationOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.StochasticDualCoordinateAscentBinaryClassifier", - "Desc": "Train an SDCA binary model.", - "FriendlyName": "Fast Linear (SA-SDCA)", - "ShortName": 
"SDCA", + "Name": "Trainers.NaiveBayesClassifier", + "Desc": "Train a MultiClassNaiveBayesTrainer.", + "FriendlyName": "Multiclass Naive Bayes", + "ShortName": "MNB", "Inputs": [ - { - "Name": "L2Const", - "Type": "Float", - "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.", - "Aliases": [ - "l2" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", - 1E-07, - 1E-06, - 1E-05, - 0.0001, - 0.001, - 0.01 - ] - } - }, { "Name": "TrainingData", "Type": "DataView", @@ -11280,27 +11343,102 @@ "IsNullable": false }, { - "Name": "L1Threshold", - "Type": "Float", - "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set.", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", "Aliases": [ - "l1" + "feat" ], "Required": false, "SortOrder": 2.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", + "IsNullable": false, + "Default": "Features" + }, + { + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", + "Aliases": [ + "lab" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Label" + }, + { + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", "Values": [ - "", - 0.0, - 0.25, - 0.5, - 0.75, - 1.0 + "No", + "Warn", + "Auto", + "Yes" ] - } + }, + "Desc": "Normalize option for the feature column", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": "Auto" + }, + { + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", + "Aliases": [ + "cache" + ], + 
"Required": false, + "SortOrder": 6.0, + "IsNullable": false, + "Default": "Auto" + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IMulticlassClassificationOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.OnlineGradientDescentRegressor", + "Desc": "Train an Online gradient descent perceptron.", + "FriendlyName": "Stochastic Gradient Descent (Regression)", + "ShortName": "ogd", + "Inputs": [ + { + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { "Name": "FeatureColumn", @@ -11370,7 +11508,7 @@ "Name": "LossFunction", "Type": { "Kind": "Component", - "ComponentKind": "SDCAClassificationLossFunction" + "ComponentKind": "RegressionLossFunction" }, "Desc": "Loss Function", "Aliases": [ "loss" ], "Required": false, "SortOrder": 50.0, "IsNullable": false, "Default": { - "Name": "LogLoss" + "Name": "SquaredLoss" } }, { - "Name": "NumThreads", - "Type": "Int", - "Desc": "Degree of lock-free parallelism. Defaults to automatic. 
Determinism not guaranteed.", + "Name": "LearningRate", + "Type": "Float", + "Desc": "Learning rate", "Aliases": [ - "nt", - "t", - "threads" + "lr" ], "Required": false, "SortOrder": 50.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "PositiveInstanceWeight", - "Type": "Float", - "Desc": "Apply weight to the positive class, for imbalanced data", - "Aliases": [ - "piw" + "IsNullable": false, + "Default": 0.1, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.01, + 0.1, + 0.5, + 1.0 + ] + } + }, + { + "Name": "DecreaseLearningRate", + "Type": "Bool", + "Desc": "Decrease learning rate", + "Aliases": [ + "decreaselr" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 1.0 + "Default": true, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } }, { - "Name": "Calibrator", - "Type": { - "Kind": "Component", - "ComponentKind": "CalibratorTrainer" - }, - "Desc": "The calibrator kind to apply to the predictor. 
Specify null for no calibration", + "Name": "L2RegularizerWeight", + "Type": "Float", + "Desc": "L2 Regularization Weight", + "Aliases": [ + "reg" + ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": { - "Name": "PlattCalibrator" + "Default": 0.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.0, + "Max": 0.5 } }, { - "Name": "MaxCalibrationExamples", + "Name": "NumIterations", "Type": "Int", - "Desc": "The maximum number of examples to use when training the calibrator", + "Desc": "Number of iterations", + "Aliases": [ + "iter" + ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 1000000 + "Default": 1, + "SweepRange": { + "RangeType": "Long", + "Min": 1, + "Max": 100, + "StepSize": 10.0, + "IsLogScale": true + } }, { - "Name": "ConvergenceTolerance", + "Name": "InitWtsDiameter", "Type": "Float", - "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.", + "Desc": "Init weights diameter", "Aliases": [ - "tol" + "initwts" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 140.0, "IsNullable": false, - "Default": 0.1, + "Default": 0.0, "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.001, - 0.01, - 0.1, - 0.2 - ] + "RangeType": "Float", + "Min": 0.0, + "Max": 1.0, + "NumSteps": 5 } }, { - "Name": "MaxIterations", + "Name": "ResetWeightsAfterXExamples", "Type": "Int", - "Desc": "Maximum number of iterations; set to 1 to simulate online learning. 
Defaults to automatic.", + "Desc": "Number of examples after which weights will be reset to the current average", "Aliases": [ - "iter" + "numreset" ], "Required": false, "SortOrder": 150.0, "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", - 10, - 20, - 100 - ] - } + "Default": null }, { - "Name": "Shuffle", + "Name": "DoLazyUpdates", "Type": "Bool", - "Desc": "Shuffle data every epoch?", + "Desc": "Instead of updating averaged weights on every example, only update when loss is nonzero", "Aliases": [ - "shuf" + "lazy" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] - } + "Default": true }, { - "Name": "CheckFrequency", - "Type": "Int", - "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations.", + "Name": "RecencyGain", + "Type": "Float", + "Desc": "Extra weight given to more recent updates", "Aliases": [ - "checkFreq" + "rg" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 0.0 }, { - "Name": "BiasLearningRate", + "Name": "RecencyGainMulti", + "Type": "Bool", + "Desc": "Whether Recency Gain is multiplicative (vs. 
additive)", + "Aliases": [ + "rgm" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "Averaged", + "Type": "Bool", + "Desc": "Do averaging?", + "Aliases": [ + "avg" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "AveragedTolerance", "Type": "Float", - "Desc": "The learning rate for adjusting bias from being regularized.", + "Desc": "The inexactness tolerance for averaging", "Aliases": [ - "blr" + "avgtol" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0, + "Default": 0.01 + }, + { + "Name": "InitialWeights", + "Type": "String", + "Desc": "Initial Weights and bias, comma-separated", + "Aliases": [ + "initweights" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Shuffle", + "Type": "Bool", + "Desc": "Whether to shuffle for each training iteration", + "Aliases": [ + "shuf" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true, "SweepRange": { "RangeType": "Discrete", "Values": [ - 0.0, - 0.01, - 0.1, - 1.0 + false, + true ] } + }, + { + "Name": "StreamingCacheSize", + "Type": "Int", + "Desc": "Size of cache when trained in Scope", + "Aliases": [ + "cache" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000000 } ], "Outputs": [ @@ -11539,40 +11743,16 @@ "ITrainerInput" ], "OutputKind": [ - "IBinaryClassificationOutput", + "IRegressionOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.StochasticDualCoordinateAscentClassifier", - "Desc": "Train an SDCA multi class model", - "FriendlyName": "Fast Linear Multi-class Classification (SA-SDCA)", - "ShortName": "sasdcamc", + "Name": "Trainers.PcaAnomalyDetector", + "Desc": "Train a PCA Anomaly model.", + "FriendlyName": "PCA Anomaly Detector", + "ShortName": "pcaAnom", "Inputs": [ - { - "Name": "L2Const", - "Type": "Float", - "Desc": "L2 
regularizer constant. By default the l2 constant is automatically inferred based on data set.", - "Aliases": [ - "l2" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", - 1E-07, - 1E-06, - 1E-05, - 0.0001, - 0.001, - 0.01 - ] - } - }, { "Name": "TrainingData", "Type": "DataView", @@ -11585,34 +11765,11 @@ "IsNullable": false }, { - "Name": "L1Threshold", - "Type": "Float", - "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set.", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", "Aliases": [ - "l1" - ], - "Required": false, - "SortOrder": 2.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", - 0.0, - 0.25, - 0.5, - 0.75, - 1.0 - ] - } - }, - { - "Name": "FeatureColumn", - "Type": "String", - "Desc": "Column to use for features", - "Aliases": [ - "feat" + "feat" ], "Required": false, "SortOrder": 2.0, @@ -11620,16 +11777,16 @@ "Default": "Features" }, { - "Name": "LabelColumn", + "Name": "WeightColumn", "Type": "String", - "Desc": "Column to use for labels", + "Desc": "Column to use for example weight", "Aliases": [ - "lab" + "weight" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": "Label" + "Default": "Weight" }, { "Name": "NormalizeFeatures", @@ -11672,84 +11829,49 @@ "Default": "Auto" }, { - "Name": "LossFunction", - "Type": { - "Kind": "Component", - "ComponentKind": "SDCAClassificationLossFunction" - }, - "Desc": "Loss Function", - "Aliases": [ - "loss" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": { - "Name": "LogLoss" - } - }, - { - "Name": "NumThreads", + "Name": "Rank", "Type": "Int", - "Desc": "Degree of lock-free 
parallelism. Defaults to automatic. Determinism not guaranteed.", + "Desc": "The number of components in the PCA", "Aliases": [ - "nt", - "t", - "threads" + "k" ], "Required": false, "SortOrder": 50.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "ConvergenceTolerance", - "Type": "Float", - "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.", - "Aliases": [ - "tol" - ], - "Required": false, - "SortOrder": 150.0, "IsNullable": false, - "Default": 0.1, + "Default": 20, "SweepRange": { "RangeType": "Discrete", "Values": [ - 0.001, - 0.01, - 0.1, - 0.2 + 10, + 20, + 40, + 80 ] } }, { - "Name": "MaxIterations", + "Name": "Oversampling", "Type": "Int", - "Desc": "Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic.", - "Aliases": [ - "iter" - ], + "Desc": "Oversampling parameter for randomized PCA training", "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 20, "SweepRange": { "RangeType": "Discrete", "Values": [ - "", 10, 20, - 100 + 40 ] } }, { - "Name": "Shuffle", + "Name": "Center", "Type": "Bool", - "Desc": "Shuffle data every epoch?", + "Desc": "If enabled, data is centered to be zero mean", "Aliases": [ - "shuf" + "center" ], "Required": false, "SortOrder": 150.0, @@ -11764,37 +11886,16 @@ } }, { - "Name": "CheckFrequency", + "Name": "Seed", "Type": "Int", - "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. 
If left blank, it defaults to check after every 'numThreads' iterations.", "Aliases": [ - "checkFreq" + "seed" ], "Required": false, "SortOrder": 150.0, "IsNullable": true, "Default": null - }, - { - "Name": "BiasLearningRate", - "Type": "Float", - "Desc": "The learning rate for adjusting bias from being regularized.", - "Aliases": [ - "blr" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.0, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.0, - 0.01, - 0.1, - 1.0 - ] - } } ], "Outputs": [ @@ -11805,44 +11906,20 @@ } ], "InputKind": [ - "ITrainerInputWithLabel", + "IUnsupervisedTrainerWithWeight", "ITrainerInput" ], "OutputKind": [ - "IMulticlassClassificationOutput", + "IAnomalyDetectionOutput", "ITrainerOutput" ] }, { - "Name": "Trainers.StochasticDualCoordinateAscentRegressor", - "Desc": "Train an SDCA regression model", - "FriendlyName": "Fast Linear Regression (SA-SDCA)", - "ShortName": "sasdcar", + "Name": "Trainers.PoissonRegressor", + "Desc": "Train a Poisson regression model.", + "FriendlyName": "Poisson Regression", + "ShortName": "PR", "Inputs": [ - { - "Name": "L2Const", - "Type": "Float", - "Desc": "L2 regularizer constant. By default the l2 constant is automatically inferred based on data set.", - "Aliases": [ - "l2" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", - 1E-07, - 1E-06, - 1E-05, - 0.0001, - 0.001, - 0.01 - ] - } - }, { "Name": "TrainingData", "Type": "DataView", "Desc": "The data to be used for training", "Aliases": [ "data" ], "Required": true, "SortOrder": 1.0, "IsNullable": false }, - { - "Name": "L1Threshold", - "Type": "Float", - "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. 
By default the l1 threshold is automatically inferred based on data set.", - "Aliases": [ - "l1" - ], - "Required": false, - "SortOrder": 2.0, - "IsNullable": true, - "Default": null, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - "", - 0.0, - 0.25, - 0.5, - 0.75, - 1.0 - ] - } - }, { "Name": "FeatureColumn", "Type": "String", @@ -11901,6 +11955,18 @@ "IsNullable": false, "Default": "Label" }, + { + "Name": "WeightColumn", + "Type": "String", + "Desc": "Column to use for example weight", + "Aliases": [ + "weight" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": "Weight" + }, { "Name": "NormalizeFeatures", "Type": { @@ -11942,103 +12008,169 @@ "Default": "Auto" }, { - "Name": "LossFunction", - "Type": { - "Kind": "Component", - "ComponentKind": "SDCARegressionLossFunction" - }, - "Desc": "Loss Function", + "Name": "L2Weight", + "Type": "Float", + "Desc": "L2 regularization weight", "Aliases": [ - "loss" + "l2" ], "Required": false, "SortOrder": 50.0, "IsNullable": false, - "Default": { - "Name": "SquaredLoss" + "Default": 1.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.0, + "Max": 1.0, + "NumSteps": 4 } }, { - "Name": "NumThreads", - "Type": "Int", - "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.", + "Name": "L1Weight", + "Type": "Float", + "Desc": "L1 regularization weight", "Aliases": [ - "nt", - "t", - "threads" + "l1" ], "Required": false, "SortOrder": 50.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 1.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.0, + "Max": 1.0, + "NumSteps": 4 + } }, { - "Name": "ConvergenceTolerance", + "Name": "OptTol", "Type": "Float", - "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.", + "Desc": "Tolerance parameter for optimization convergence. 
Lower = slower, more accurate", "Aliases": [ - "tol" + "ot" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 0.01, + "Default": 1E-07, "SweepRange": { "RangeType": "Discrete", "Values": [ - 0.001, - 0.01, - 0.1, - 0.2 + 0.0001, + 1E-07 ] } }, { - "Name": "MaxIterations", + "Name": "MemorySize", "Type": "Int", - "Desc": "Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic.", + "Desc": "Memory size for L-BFGS. Lower=faster, less accurate", "Aliases": [ - "iter" + "m" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 20, "SweepRange": { "RangeType": "Discrete", "Values": [ - "", - 10, + 5, 20, - 100 + 50 ] } }, { - "Name": "Shuffle", + "Name": "EnforceNonNegativity", "Type": "Bool", - "Desc": "Shuffle data every epoch?", + "Desc": "Enforce non-negative weights", "Aliases": [ - "shuf" + "nn" + ], + "Required": false, + "SortOrder": 90.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "InitWtsDiameter", + "Type": "Float", + "Desc": "Init weights diameter", + "Aliases": [ + "initwts" + ], + "Required": false, + "SortOrder": 140.0, + "IsNullable": false, + "Default": 0.0, + "SweepRange": { + "RangeType": "Float", + "Min": 0.0, + "Max": 1.0, + "NumSteps": 5 + } + }, + { + "Name": "MaxIterations", + "Type": "Int", + "Desc": "Maximum iterations.", + "Aliases": [ + "maxiter" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true, + "Default": 2147483647, "SweepRange": { - "RangeType": "Discrete", - "Values": [ - false, - true - ] + "RangeType": "Long", + "Min": 1, + "Max": 2147483647 } }, { - "Name": "CheckFrequency", + "Name": "SgdInitializationTolerance", + "Type": "Float", + "Desc": "Run SGD to initialize LR weights, converging to this tolerance", + "Aliases": [ + "sgd" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 
+ }, + { + "Name": "Quiet", + "Type": "Bool", + "Desc": "If set to true, produce no output during training.", + "Aliases": [ + "q" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "UseThreads", + "Type": "Bool", + "Desc": "Whether or not to use threads. Default is true", + "Aliases": [ + "t" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "NumThreads", "Type": "Int", - "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations.", + "Desc": "Number of threads", "Aliases": [ - "checkFreq" + "nt" ], "Required": false, "SortOrder": 150.0, @@ -12046,23 +12178,21 @@ "Default": null }, { - "Name": "BiasLearningRate", - "Type": "Float", - "Desc": "The learning rate for adjusting bias from being regularized.", + "Name": "DenseOptimizer", + "Type": "Bool", + "Desc": "Force densification of the internal optimization vectors", "Aliases": [ - "blr" + "do" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0, + "Default": false, "SweepRange": { "RangeType": "Discrete", "Values": [ - 0.0, - 0.01, - 0.1, - 1.0 + false, + true ] } } @@ -12075,6 +12205,7 @@ } ], "InputKind": [ + "ITrainerInputWithWeight", "ITrainerInputWithLabel", "ITrainerInput" ], @@ -12084,11 +12215,35 @@ ] }, { - "Name": "Trainers.StochasticGradientDescentBinaryClassifier", - "Desc": "Train an Hogwild SGD binary model.", - "FriendlyName": "Hogwild SGD (binary)", - "ShortName": "HogwildSGD", + "Name": "Trainers.StochasticDualCoordinateAscentBinaryClassifier", + "Desc": "Train an SDCA binary model.", + "FriendlyName": "Fast Linear (SA-SDCA)", + "ShortName": "SDCA", "Inputs": [ + { + "Name": "L2Const", + "Type": "Float", + "Desc": "L2 regularizer constant. 
By default the l2 constant is automatically inferred based on data set.", + "Aliases": [ + "l2" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 1E-07, + 1E-06, + 1E-05, + 0.0001, + 0.001, + 0.01 + ] + } + }, { "Name": "TrainingData", "Type": "DataView", @@ -12100,6 +12255,29 @@ "SortOrder": 1.0, "IsNullable": false }, + { + "Name": "L1Threshold", + "Type": "Float", + "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set.", + "Aliases": [ + "l1" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 0.0, + 0.25, + 0.5, + 0.75, + 1.0 + ] + } + }, { "Name": "FeatureColumn", "Type": "String", @@ -12124,18 +12302,6 @@ "IsNullable": false, "Default": "Label" }, - { - "Name": "WeightColumn", - "Type": "String", - "Desc": "Column to use for example weight", - "Aliases": [ - "weight" - ], - "Required": false, - "SortOrder": 4.0, - "IsNullable": false, - "Default": "Weight" - }, { "Name": "NormalizeFeatures", "Type": { @@ -12180,7 +12346,7 @@ "Name": "LossFunction", "Type": { "Kind": "Component", - "ComponentKind": "ClassificationLossFunction" + "ComponentKind": "SDCAClassificationLossFunction" }, "Desc": "Loss Function", "Aliases": [ @@ -12193,32 +12359,10 @@ "Name": "LogLoss" } }, - { - "Name": "L2Const", - "Type": "Float", - "Desc": "L2 regularizer constant", - "Aliases": [ - "l2" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 1E-06, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 1E-07, - 5E-07, - 1E-06, - 5E-06, - 1E-05 - ] - } - }, { "Name": "NumThreads", "Type": "Int", - "Desc": "Degree of lock-free parallelism. 
Defaults to automatic depending on data sparseness. Determinism not guaranteed.", + "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.", "Aliases": [ "nt", "t", @@ -12229,61 +12373,83 @@ "IsNullable": true, "Default": null }, + { + "Name": "PositiveInstanceWeight", + "Type": "Float", + "Desc": "Apply weight to the positive class, for imbalanced data", + "Aliases": [ + "piw" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + }, + { + "Name": "Calibrator", + "Type": { + "Kind": "Component", + "ComponentKind": "CalibratorTrainer" + }, + "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": { + "Name": "PlattCalibrator" + } + }, + { + "Name": "MaxCalibrationExamples", + "Type": "Int", + "Desc": "The maximum number of examples to use when training the calibrator", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000000 + }, { "Name": "ConvergenceTolerance", "Type": "Float", - "Desc": "Exponential moving averaged improvement tolerance for convergence", + "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.", "Aliases": [ "tol" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0.0001, + "Default": 0.1, "SweepRange": { "RangeType": "Discrete", "Values": [ - 0.01, 0.001, - 0.0001, - 1E-05 + 0.01, + 0.1, + 0.2 ] } }, { "Name": "MaxIterations", "Type": "Int", - "Desc": "Maximum number of iterations; set to 1 to simulate online learning.", + "Desc": "Maximum number of iterations; set to 1 to simulate online learning. 
Defaults to automatic.", "Aliases": [ "iter" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": 20, + "IsNullable": true, + "Default": null, "SweepRange": { "RangeType": "Discrete", "Values": [ - 1, - 5, + "", 10, - 20 + 20, + 100 ] } }, - { - "Name": "InitLearningRate", - "Type": "Float", - "Desc": "Initial learning rate (only used by SGD)", - "Aliases": [ - "ilr", - "lr" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.01 - }, { "Name": "Shuffle", "Type": "Bool", @@ -12303,22 +12469,10 @@ ] } }, - { - "Name": "PositiveInstanceWeight", - "Type": "Float", - "Desc": "Apply weight to the positive class, for imbalanced data", - "Aliases": [ - "piw" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1.0 - }, { "Name": "CheckFrequency", "Type": "Int", - "Desc": "Convergence check frequency (in terms of number of iterations). Default equals number of threads", + "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations.", "Aliases": [ "checkFreq" ], @@ -12328,27 +12482,25 @@ "Default": null }, { - "Name": "Calibrator", - "Type": { - "Kind": "Component", - "ComponentKind": "CalibratorTrainer" - }, - "Desc": "The calibrator kind to apply to the predictor. 
Specify null for no calibration", + "Name": "BiasLearningRate", + "Type": "Float", + "Desc": "The learning rate for adjusting bias from being regularized.", + "Aliases": [ + "blr" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": { - "Name": "PlattCalibrator" + "Default": 0.0, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.0, + 0.01, + 0.1, + 1.0 + ] } - }, - { - "Name": "MaxCalibrationExamples", - "Type": "Int", - "Desc": "The maximum number of examples to use when training the calibrator", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1000000 } ], "Outputs": [ @@ -12359,7 +12511,6 @@ } ], "InputKind": [ - "ITrainerInputWithWeight", "ITrainerInputWithLabel", "ITrainerInput" ], @@ -12369,962 +12520,836 @@ ] }, { - "Name": "Transforms.ApproximateBootstrapSampler", - "Desc": "Approximate bootstrap sampling.", - "FriendlyName": "Bootstrap Sample Transform", - "ShortName": "BootstrapSample", + "Name": "Trainers.StochasticDualCoordinateAscentClassifier", + "Desc": "Train an SDCA multi class model", + "FriendlyName": "Fast Linear Multi-class Classification (SA-SDCA)", + "ShortName": "sasdcamc", "Inputs": [ { - "Name": "Data", + "Name": "L2Const", + "Type": "Float", + "Desc": "L2 regularizer constant. 
By default the l2 constant is automatically inferred based on data set.", + "Aliases": [ + "l2" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 1E-07, + 1E-06, + 1E-05, + 0.0001, + 0.001, + 0.01 + ] + } + }, + { + "Name": "TrainingData", "Type": "DataView", - "Desc": "Input dataset", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "Complement", - "Type": "Bool", - "Desc": "Whether this is the out-of-bag sample, that is, all those rows that are not selected by the transform.", + "Name": "L1Threshold", + "Type": "Float", + "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set.", "Aliases": [ - "comp" + "l1" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false + "SortOrder": 2.0, + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 0.0, + 0.25, + 0.5, + 0.75, + 1.0 + ] + } }, { - "Name": "Seed", - "Type": "UInt", - "Desc": "The random seed. If unspecified random state will be instead derived from the environment.", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", + "Aliases": [ + "feat" + ], "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Features" }, { - "Name": "ShuffleInput", - "Type": "Bool", - "Desc": "Whether we should attempt to shuffle the source data. 
By default on, but can be turned off for efficiency.", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", "Aliases": [ - "si" + "lab" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": true + "Default": "Label" }, { - "Name": "PoolSize", - "Type": "Int", - "Desc": "When shuffling the output, the number of output rows to keep in that pool. Note that shuffling of output is completely distinct from shuffling of input.", + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", "Aliases": [ - "pool" + "norm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": 1000 - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Default": "Auto" }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.BinaryPredictionScoreColumnsRenamer", - "Desc": "For binary prediction, it renames the PredictedLabel and Score columns to include the name of the positive class.", - "FriendlyName": "Rename Binary Prediction Score Columns", - "ShortName": null, - "Inputs": [ - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", + "Aliases": [ + "cache" + ], + "Required": false, + "SortOrder": 6.0, + "IsNullable": false, + "Default": "Auto" }, { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The predictor model used in scoring", - "Required": true, - "SortOrder": 2.0, - "IsNullable": false - } - 
], - "Outputs": [ + "Name": "LossFunction", + "Type": { + "Kind": "Component", + "ComponentKind": "SDCAClassificationLossFunction" + }, + "Desc": "Loss Function", + "Aliases": [ + "loss" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": { + "Name": "LogLoss" + } + }, { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.BinNormalizer", - "Desc": "The values are assigned into equidensity bins and a value is mapped to its bin_number/number_of_bins.", - "FriendlyName": "Binning Normalizer", - "ShortName": "Bin", - "Inputs": [ - { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "NumBins", - "Type": "Int", - "Desc": "Max number of bins, power of 2 recommended", - "Aliases": [ - "bins" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "FixZero", - "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", - "Aliases": [ - "zero" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "MaxTrainingExamples", - "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", - "Aliases": [ - "maxtrain" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": 
"New column definition(s) (optional form: name:src)", + "Name": "NumThreads", + "Type": "Int", + "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.", "Aliases": [ - "col" + "nt", + "t", + "threads" ], "Required": false, - "SortOrder": 1.0, - "IsNullable": false, + "SortOrder": 50.0, + "IsNullable": true, "Default": null }, { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Name": "ConvergenceTolerance", + "Type": "Float", + "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.", + "Aliases": [ + "tol" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.1, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.001, + 0.01, + 0.1, + 0.2 + ] + } }, { - "Name": "NumBins", + "Name": "MaxIterations", "Type": "Int", - "Desc": "Max number of bins, power of 2 recommended", + "Desc": "Maximum number of iterations; set to 1 to simulate online learning. Defaults to automatic.", "Aliases": [ - "bins" + "iter" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": 1024 + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 10, + 20, + 100 + ] + } }, { - "Name": "FixZero", + "Name": "Shuffle", "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", + "Desc": "Shuffle data every epoch?", "Aliases": [ - "zero" + "shuf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": true, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } }, { - "Name": "MaxTrainingExamples", + "Name": "CheckFrequency", "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", + "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. 
If left blank, it defaults to check after every 'numThreads' iterations.", "Aliases": [ - "maxtrain" + "checkFreq" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "BiasLearningRate", + "Type": "Float", + "Desc": "The learning rate for adjusting bias from being regularized.", + "Aliases": [ + "blr" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1000000000 + "Default": 0.0, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.0, + 0.01, + 0.1, + 1.0 + ] + } } ], "Outputs": [ { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" } ], "InputKind": [ - "ITransformInput" + "ITrainerInputWithLabel", + "ITrainerInput" ], "OutputKind": [ - "ITransformOutput" + "IMulticlassClassificationOutput", + "ITrainerOutput" ] }, { - "Name": "Transforms.CategoricalHashOneHotVectorizer", - "Desc": "Encodes the categorical variable with hash-based encoding", - "FriendlyName": "Categorical Hash Transform", - "ShortName": null, + "Name": "Trainers.StochasticDualCoordinateAscentRegressor", + "Desc": "Train an SDCA regression model", + "FriendlyName": "Fast Linear Regression (SA-SDCA)", + "ShortName": "sasdcar", "Inputs": [ { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "OutputKind", - "Type": { - "Kind": "Enum", - "Values": [ - "Bag", - "Ind", - "Key", - "Bin" - ] - }, - "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", - "Aliases": [ - "kind" - ], - "Required": false, - "SortOrder": 102.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "HashBits", - "Type": "Int", - "Desc": "The number of bits to hash into. 
Must be between 1 and 30, inclusive.", - "Aliases": [ - "bits" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Seed", - "Type": "UInt", - "Desc": "Hashing seed", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Ordered", - "Type": "Bool", - "Desc": "Whether the position of each term should be included in the hash", - "Aliases": [ - "ord" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "InvertHash", - "Type": "Int", - "Desc": "Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.", - "Aliases": [ - "ih" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": "New column definition(s) (optional form: name:hashBits:src)", + "Name": "L2Const", + "Type": "Float", + "Desc": "L2 regularizer constant. 
By default the l2 constant is automatically inferred based on data set.", "Aliases": [ - "col" + "l2" ], - "Required": true, + "Required": false, "SortOrder": 1.0, - "IsNullable": false + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 1E-07, + 1E-06, + 1E-05, + 0.0001, + 0.001, + 0.01 + ] + } }, { - "Name": "Data", + "Name": "TrainingData", "Type": "DataView", - "Desc": "Input dataset", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "HashBits", - "Type": "Int", - "Desc": "Number of bits to hash into. Must be between 1 and 30, inclusive.", + "Name": "L1Threshold", + "Type": "Float", + "Desc": "L1 soft threshold (L1/L2). Note that it is easier to control and sweep using the threshold parameter than the raw L1-regularizer constant. By default the l1 threshold is automatically inferred based on data set.", "Aliases": [ - "bits" + "l1" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 0.0, + 0.25, + 0.5, + 0.75, + 1.0 + ] + } + }, + { + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", + "Aliases": [ + "feat" ], "Required": false, "SortOrder": 2.0, "IsNullable": false, - "Default": 16 + "Default": "Features" }, { - "Name": "OutputKind", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", + "Aliases": [ + "lab" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Label" + }, + { + "Name": "NormalizeFeatures", "Type": { "Kind": "Enum", "Values": [ - "Bag", - "Ind", - "Key", - "Bin" + "No", + "Warn", + "Auto", + "Yes" ] }, - "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", + "Desc": "Normalize option for the feature column", "Aliases": [ - "kind" + "norm" ], "Required": false, - 
"SortOrder": 102.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": "Bag" + "Default": "Auto" }, { - "Name": "Seed", - "Type": "UInt", - "Desc": "Hashing seed", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", + "Aliases": [ + "cache" + ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": 314489979 + "Default": "Auto" }, { - "Name": "Ordered", - "Type": "Bool", - "Desc": "Whether the position of each term should be included in the hash", + "Name": "LossFunction", + "Type": { + "Kind": "Component", + "ComponentKind": "SDCARegressionLossFunction" + }, + "Desc": "Loss Function", "Aliases": [ - "ord" + "loss" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": true + "Default": { + "Name": "SquaredLoss" + } }, { - "Name": "InvertHash", + "Name": "NumThreads", "Type": "Int", - "Desc": "Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.", + "Desc": "Degree of lock-free parallelism. Defaults to automatic. Determinism not guaranteed.", "Aliases": [ - "ih" + "nt", + "t", + "threads" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "ConvergenceTolerance", + "Type": "Float", + "Desc": "The tolerance for the ratio between duality gap and primal loss for convergence checking.", + "Aliases": [ + "tol" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 - } - ], - "Outputs": [ + "Default": 0.01, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.001, + 0.01, + 0.1, + 0.2 + ] + } + }, { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Name": "MaxIterations", + "Type": "Int", + "Desc": "Maximum number of iterations; set to 1 to simulate online learning. 
Defaults to automatic.", + "Aliases": [ + "iter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + "", + 10, + 20, + 100 + ] + } }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" + "Name": "Shuffle", + "Type": "Bool", + "Desc": "Shuffle data every epoch?", + "Aliases": [ + "shuf" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } + }, + { + "Name": "CheckFrequency", + "Type": "Int", + "Desc": "Convergence check frequency (in terms of number of iterations). Set as negative or zero for not checking at all. If left blank, it defaults to check after every 'numThreads' iterations.", + "Aliases": [ + "checkFreq" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "BiasLearningRate", + "Type": "Float", + "Desc": "The learning rate for adjusting bias from being regularized.", + "Aliases": [ + "blr" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.0, + 0.01, + 0.1, + 1.0 + ] + } + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IRegressionOutput", + "ITrainerOutput" ] }, { - "Name": "Transforms.CategoricalOneHotVectorizer", - "Desc": "Encodes the categorical variable with one-hot encoding based on term dictionary", - "FriendlyName": "Categorical Transform", - "ShortName": null, + "Name": "Trainers.StochasticGradientDescentBinaryClassifier", + "Desc": "Train an Hogwild SGD binary model.", + "FriendlyName": "Hogwild SGD 
(binary)", + "ShortName": "HogwildSGD", "Inputs": [ { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "OutputKind", - "Type": { - "Kind": "Enum", - "Values": [ - "Bag", - "Ind", - "Key", - "Bin" - ] - }, - "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector", - "Aliases": [ - "kind" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "MaxNumTerms", - "Type": "Int", - "Desc": "Maximum number of terms to keep when auto-training", - "Aliases": [ - "max" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Term", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "List of terms", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Sort", - "Type": { - "Kind": "Enum", - "Values": [ - "Occurrence", - "Value" - ] - }, - "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "TextKeyValues", - "Type": "Bool", - "Desc": "Whether key value metadata should be text, regardless of the actual input type", - "Aliases": [ - "textkv" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": "New column definition(s) (optional form: name:src)", + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", "Aliases": [ - "col" + "data" ], "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "MaxNumTerms", - "Type": "Int", - "Desc": "Maximum number of terms to keep per column when auto-training", + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", "Aliases": [ - "max" + "feat" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": 1000000 + "Default": "Features" }, { - "Name": "OutputKind", - "Type": { - "Kind": "Enum", - "Values": [ - "Bag", - "Ind", - "Key", - "Bin" - ] - }, - "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", "Aliases": [ - "kind" + "lab" ], "Required": false, - 
"SortOrder": 102.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": "Ind" + "Default": "Label" }, { - "Name": "Term", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "List of terms", + "Name": "WeightColumn", + "Type": "String", + "Desc": "Column to use for example weight", + "Aliases": [ + "weight" + ], "Required": false, - "SortOrder": 106.0, + "SortOrder": 4.0, "IsNullable": false, - "Default": null + "Default": "Weight" }, { - "Name": "Sort", + "Name": "NormalizeFeatures", "Type": { "Kind": "Enum", "Values": [ - "Occurrence", - "Value" + "No", + "Warn", + "Auto", + "Yes" ] }, - "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Desc": "Normalize option for the feature column", + "Aliases": [ + "norm" + ], "Required": false, - "SortOrder": 113.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": "Occurrence" + "Default": "Auto" }, { - "Name": "TextKeyValues", - "Type": "Bool", - "Desc": "Whether key value metadata should be text, regardless of the actual input type", + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", "Aliases": [ - "textkv" + "cache" ], "Required": false, - "SortOrder": 114.0, + "SortOrder": 6.0, "IsNullable": false, - "Default": true - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Default": "Auto" }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.CharacterTokenizer", - "Desc": "Character-oriented tokenizer where text is considered a sequence of characters.", - "FriendlyName": 
"Character Tokenizer Transform", - "ShortName": "CharToken", - "Inputs": [ - { - "Name": "Column", + "Name": "LossFunction", "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } + "Kind": "Component", + "ComponentKind": "ClassificationLossFunction" }, - "Desc": "New column definition(s) (optional form: name:src)", + "Desc": "Loss Function", "Aliases": [ - "col" + "loss" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": { + "Name": "LogLoss" + } }, { - "Name": "UseMarkerChars", - "Type": "Bool", - "Desc": "Whether to mark the beginning/end of each row/slot with start of text character (0x02)/end of text character (0x03)", + "Name": "L2Const", + "Type": "Float", + "Desc": "L2 regularizer constant", "Aliases": [ - "mark" + "l2" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": true - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Default": 1E-06, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1E-07, + 5E-07, + 1E-06, + 5E-06, + 1E-05 + ] + } }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.ColumnConcatenator", - "Desc": "Concatenates two columns of 
the same item type.", - "FriendlyName": "Concat Transform", - "ShortName": "Concat", - "Inputs": [ - { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": "New column definition(s) (optional form: name:srcs)", + "Name": "NumThreads", + "Type": "Int", + "Desc": "Degree of lock-free parallelism. Defaults to automatic depending on data sparseness. Determinism not guaranteed.", "Aliases": [ - "col" + "nt", + "t", + "threads" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Required": false, + "SortOrder": 50.0, + "IsNullable": true, + "Default": null }, { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - } - ], - "Outputs": [ + "Name": "ConvergenceTolerance", + "Type": "Float", + "Desc": "Exponential moving averaged improvement tolerance for convergence", + "Aliases": [ + "tol" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0001, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.01, + 0.001, + 0.0001, + 1E-05 + ] + } + }, { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Name": "MaxIterations", + "Type": "Int", + "Desc": "Maximum number of iterations; set to 1 to simulate online learning.", + "Aliases": [ + "iter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 20, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 5, + 10, + 20 + ] + } }, { 
- "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.ColumnCopier", - "Desc": "Duplicates columns from the dataset", - "FriendlyName": "Copy Columns Transform", - "ShortName": "Copy", - "Inputs": [ + "Name": "InitLearningRate", + "Type": "Float", + "Desc": "Initial learning rate (only used by SGD)", + "Aliases": [ + "ilr", + "lr" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01 + }, { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": "New column definition(s) (optional form: name:src)", + "Name": "Shuffle", + "Type": "Bool", + "Desc": "Shuffle data every epoch?", "Aliases": [ - "col" + "shuf" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } }, { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Name": "PositiveInstanceWeight", + "Type": "Float", + "Desc": "Apply weight to the positive class, for imbalanced data", + "Aliases": [ + "piw" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + }, + { + "Name": "CheckFrequency", + "Type": "Int", + "Desc": "Convergence check frequency (in terms of number of iterations). 
Default equals number of threads", + "Aliases": [ + "checkFreq" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Calibrator", + "Type": { + "Kind": "Component", + "ComponentKind": "CalibratorTrainer" + }, + "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": { + "Name": "PlattCalibrator" + } + }, + { + "Name": "MaxCalibrationExamples", + "Type": "Int", + "Desc": "The maximum number of examples to use when training the calibrator", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000000 } ], "Outputs": [ { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" } ], "InputKind": [ - "ITransformInput" + "ITrainerInputWithWeight", + "ITrainerInputWithLabel", + "ITrainerInput" ], "OutputKind": [ - "ITransformOutput" + "IBinaryClassificationOutput", + "ITrainerOutput" ] }, { - "Name": "Transforms.ColumnDropper", - "Desc": "Drops columns from the dataset", - "FriendlyName": "Drop Columns Transform", - "ShortName": "Drop", + "Name": "Transforms.ApproximateBootstrapSampler", + "Desc": "Approximate bootstrap sampling.", + "FriendlyName": "Bootstrap Sample Transform", + "ShortName": "BootstrapSample", "Inputs": [ - { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Column name to drop", - "Aliases": [ - "col" - ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, { "Name": "Data", "Type": "DataView", @@ -13332,6 +13357,51 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false + }, + { + "Name": "Complement", + "Type": "Bool", + "Desc": "Whether this is the out-of-bag sample, that is, all those rows that are not selected by 
the transform.", + "Aliases": [ + "comp" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "Seed", + "Type": "UInt", + "Desc": "The random seed. If unspecified random state will be instead derived from the environment.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "ShuffleInput", + "Type": "Bool", + "Desc": "Whether we should attempt to shuffle the source data. By default on, but can be turned off for efficiency.", + "Aliases": [ + "si" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "PoolSize", + "Type": "Int", + "Desc": "When shuffling the output, the number of output rows to keep in that pool. Note that shuffling of output is completely distinct from shuffling of input.", + "Aliases": [ + "pool" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000 } ], "Outputs": [ @@ -13354,26 +13424,11 @@ ] }, { - "Name": "Transforms.ColumnSelector", - "Desc": "Selects a set of columns, dropping all others", - "FriendlyName": "Select Columns", + "Name": "Transforms.BinaryPredictionScoreColumnsRenamer", + "Desc": "For binary prediction, it renames the PredictedLabel and Score columns to include the name of the positive class.", + "FriendlyName": "Rename Binary Prediction Score Columns", "ShortName": null, "Inputs": [ - { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Column name to keep", - "Aliases": [ - "col" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": null - }, { "Name": "Data", "Type": "DataView", @@ -13381,6 +13436,14 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false + }, + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The predictor model used in scoring", + "Required": true, + "SortOrder": 2.0, + "IsNullable": false } ], "Outputs": [ @@ -13403,10 
+13466,10 @@ ] }, { - "Name": "Transforms.ColumnTypeConverter", - "Desc": "Converts a column to a different type, using standard conversions.", - "FriendlyName": "Convert Transform", - "ShortName": "Convert", + "Name": "Transforms.BinNormalizer", + "Desc": "The values are assigned into equidensity bins and a value is mapped to its bin_number/number_of_bins.", + "FriendlyName": "Binning Normalizer", + "ShortName": "Bin", "Inputs": [ { "Name": "Column", @@ -13416,39 +13479,11 @@ "Kind": "Struct", "Fields": [ { - "Name": "ResultType", - "Type": { - "Kind": "Enum", - "Values": [ - "I1", - "U1", - "I2", - "U2", - "I4", - "U4", - "I8", - "U8", - "R4", - "Num", - "R8", - "TX", - "Text", - "TXT", - "BL", - "Bool", - "TimeSpan", - "TS", - "DT", - "DateTime", - "DZ", - "DateTimeZone", - "UG", - "U16" - ] - }, - "Desc": "The result type", + "Name": "NumBins", + "Type": "Int", + "Desc": "Max number of bins, power of 2 recommended", "Aliases": [ - "type" + "bins" ], "Required": false, "SortOrder": 150.0, @@ -13456,15 +13491,27 @@ "Default": null }, { - "Name": "Range", - "Type": "String", - "Desc": "For a key column, this defines the range of values", + "Name": "FixZero", + "Type": "Bool", + "Desc": "Whether to map zero to zero, preserving sparsity", "Aliases": [ - "key" + "zero" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, + "IsNullable": true, + "Default": null + }, + { + "Name": "MaxTrainingExamples", + "Type": "Int", + "Desc": "Max number of examples used to train the normalizer", + "Aliases": [ + "maxtrain" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, "Default": null }, { @@ -13494,13 +13541,14 @@ ] } }, - "Desc": "New column definition(s) (optional form: name:type:src)", + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ "col" ], - "Required": true, + "Required": false, "SortOrder": 1.0, - "IsNullable": false + "IsNullable": false, + "Default": null }, { "Name": "Data", @@ -13511,119 +13559,40 @@ 
"IsNullable": false }, { - "Name": "ResultType", - "Type": { - "Kind": "Enum", - "Values": [ - "I1", - "U1", - "I2", - "U2", - "I4", - "U4", - "I8", - "U8", - "R4", - "Num", - "R8", - "TX", - "Text", - "TXT", - "BL", - "Bool", - "TimeSpan", - "TS", - "DT", - "DateTime", - "DZ", - "DateTimeZone", - "UG", - "U16" - ] - }, - "Desc": "The result type", + "Name": "NumBins", + "Type": "Int", + "Desc": "Max number of bins, power of 2 recommended", "Aliases": [ - "type" + "bins" ], "Required": false, - "SortOrder": 2.0, - "IsNullable": true, - "Default": null + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1024 }, { - "Name": "Range", - "Type": "String", - "Desc": "For a key column, this defines the range of values", + "Name": "FixZero", + "Type": "Bool", + "Desc": "Whether to map zero to zero, preserving sparsity", "Aliases": [ - "key" + "zero" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": null - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Default": true }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.CombinerByContiguousGroupId", - "Desc": "Groups values of a scalar column into a vector, by a contiguous group ID", - "FriendlyName": "Group Transform", - "ShortName": "Group", - "Inputs": [ - { - "Name": "GroupKey", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Columns to group by", + "Name": "MaxTrainingExamples", + "Type": "Int", + "Desc": "Max number of examples used to train the normalizer", "Aliases": [ - "g" + "maxtrain" ], "Required": false, - "SortOrder": 1.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": null - }, - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Column", - 
"Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Columns to group together", - "Aliases": [ - "col" - ], - "Required": true, - "SortOrder": 2.0, - "IsNullable": false + "Default": 1000000000 } ], "Outputs": [ @@ -13646,9 +13615,9 @@ ] }, { - "Name": "Transforms.ConditionalNormalizer", - "Desc": "Normalize the columns only if needed", - "FriendlyName": "Normalize If Needed", + "Name": "Transforms.CategoricalHashOneHotVectorizer", + "Desc": "Encodes the categorical variable with hash-based encoding", + "FriendlyName": "Categorical Hash Transform", "ShortName": null, "Inputs": [ { @@ -13659,11 +13628,52 @@ "Kind": "Struct", "Fields": [ { - "Name": "FixZero", + "Name": "OutputKind", + "Type": { + "Kind": "Enum", + "Values": [ + "Bag", + "Ind", + "Key", + "Bin" + ] + }, + "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", + "Aliases": [ + "kind" + ], + "Required": false, + "SortOrder": 102.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "HashBits", + "Type": "Int", + "Desc": "The number of bits to hash into. Must be between 1 and 30, inclusive.", + "Aliases": [ + "bits" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Seed", + "Type": "UInt", + "Desc": "Hashing seed", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Ordered", "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", + "Desc": "Whether the position of each term should be included in the hash", "Aliases": [ - "zero" + "ord" ], "Required": false, "SortOrder": 150.0, @@ -13671,11 +13681,11 @@ "Default": null }, { - "Name": "MaxTrainingExamples", + "Name": "InvertHash", "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", + "Desc": "Limit the number of keys used to generate the slot name to this many. 
0 means no invert hashing, -1 means no limit.", "Aliases": [ - "maxtrain" + "ih" ], "Required": false, "SortOrder": 150.0, @@ -13709,7 +13719,7 @@ ] } }, - "Desc": "New column definition(s) (optional form: name:src)", + "Desc": "New column definition(s) (optional form: name:hashBits:src)", "Aliases": [ "col" ], @@ -13726,173 +13736,95 @@ "IsNullable": false }, { - "Name": "FixZero", - "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", + "Name": "HashBits", + "Type": "Int", + "Desc": "Number of bits to hash into. Must be between 1 and 30, inclusive.", "Aliases": [ - "zero" + "bits" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": true + "Default": 16 }, { - "Name": "MaxTrainingExamples", - "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", - "Aliases": [ - "maxtrain" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1000000000 - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ] - }, - { - "Name": "Transforms.DataCache", - "Desc": "Caches using the specified cache option.", - "FriendlyName": "Cache Data", - "ShortName": null, - "Inputs": [ - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Caching", + "Name": "OutputKind", "Type": { "Kind": "Enum", "Values": [ - "Memory", - "Disk" + "Bag", + "Ind", + "Key", + "Bin" ] }, - "Desc": "Caching strategy", - "Required": true, - "SortOrder": 2.0, + "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", + "Aliases": [ + "kind" + ], + "Required": false, + "SortOrder": 102.0, "IsNullable": false, - "Default": "Memory" - } - ], - "Outputs": [ - { - "Name": "OutputData", - 
"Type": "DataView", - "Desc": "Dataset" - } - ], - "InputKind": [ - "ITransformInput" - ] - }, - { - "Name": "Transforms.DatasetScorer", - "Desc": "Score a dataset with a predictor model", - "FriendlyName": null, - "ShortName": null, - "Inputs": [ - { - "Name": "Data", - "Type": "DataView", - "Desc": "The dataset to be scored", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The predictor model to apply to data", - "Required": true, - "SortOrder": 2.0, - "IsNullable": false + "Default": "Bag" }, { - "Name": "Suffix", - "Type": "String", - "Desc": "Suffix to append to the score columns", + "Name": "Seed", + "Type": "UInt", + "Desc": "Hashing seed", "Required": false, - "SortOrder": 3.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": null - } - ], - "Outputs": [ - { - "Name": "ScoredData", - "Type": "DataView", - "Desc": "The scored dataset" + "Default": 314489979 }, { - "Name": "ScoringTransform", - "Type": "TransformModel", - "Desc": "The scoring transform" - } - ] - }, - { - "Name": "Transforms.DatasetTransformScorer", - "Desc": "Score a dataset with a transform model", - "FriendlyName": null, - "ShortName": null, - "Inputs": [ - { - "Name": "Data", - "Type": "DataView", - "Desc": "The dataset to be scored", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Name": "Ordered", + "Type": "Bool", + "Desc": "Whether the position of each term should be included in the hash", + "Aliases": [ + "ord" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true }, { - "Name": "TransformModel", - "Type": "TransformModel", - "Desc": "The transform model to apply to data", - "Required": true, - "SortOrder": 2.0, - "IsNullable": false + "Name": "InvertHash", + "Type": "Int", + "Desc": "Limit the number of keys used to generate the slot name to this many. 
0 means no invert hashing, -1 means no limit.", + "Aliases": [ + "ih" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0 } ], "Outputs": [ { - "Name": "ScoredData", + "Name": "OutputData", "Type": "DataView", - "Desc": "The scored dataset" + "Desc": "Transformed dataset" }, { - "Name": "ScoringTransform", + "Name": "Model", "Type": "TransformModel", - "Desc": "The scoring transform" + "Desc": "Transform model" } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" ] }, { - "Name": "Transforms.Dictionarizer", - "Desc": "Converts input values (words, numbers, etc.) to index in a dictionary.", - "FriendlyName": "Term Transform", - "ShortName": "TermTransform", + "Name": "Transforms.CategoricalOneHotVectorizer", + "Desc": "Encodes the categorical variable with one-hot encoding based on term dictionary", + "FriendlyName": "Categorical Transform", + "ShortName": null, "Inputs": [ { "Name": "Column", @@ -13901,6 +13833,26 @@ "ItemType": { "Kind": "Struct", "Fields": [ + { + "Name": "OutputKind", + "Type": { + "Kind": "Enum", + "Values": [ + "Bag", + "Ind", + "Key", + "Bin" + ] + }, + "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector", + "Aliases": [ + "kind" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, { "Name": "MaxNumTerms", "Type": "Int", @@ -13983,10 +13935,9 @@ "Aliases": [ "col" ], - "Required": false, + "Required": true, "SortOrder": 1.0, - "IsNullable": false, - "Default": null + "IsNullable": false }, { "Name": "Data", @@ -14008,6 +13959,26 @@ "IsNullable": false, "Default": 1000000 }, + { + "Name": "OutputKind", + "Type": { + "Kind": "Enum", + "Values": [ + "Bag", + "Ind", + "Key", + "Bin" + ] + }, + "Desc": "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", + "Aliases": [ + "kind" + ], + "Required": false, + "SortOrder": 102.0, + "IsNullable": false, + 
"Default": "Ind" + }, { "Name": "Term", "Type": { @@ -14045,7 +14016,7 @@ "Required": false, "SortOrder": 114.0, "IsNullable": false, - "Default": false + "Default": true } ], "Outputs": [ @@ -14068,64 +14039,46 @@ ] }, { - "Name": "Transforms.FeatureCombiner", - "Desc": "Combines all the features into one feature column.", - "FriendlyName": "Feature Combiner", - "ShortName": "fc", + "Name": "Transforms.CharacterTokenizer", + "Desc": "Character-oriented tokenizer where text is considered a sequence of characters.", + "FriendlyName": "Character Tokenizer Transform", + "ShortName": "CharToken", "Inputs": [ { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Features", + "Name": "Column", "Type": { "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Features", - "Required": false, - "SortOrder": 2.0, - "IsNullable": false, - "Default": null - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.FeatureSelectorByCount", - "Desc": "Selects the slots for which the count of non-default values is greater than or equal to a threshold.", - "FriendlyName": "Count Feature Selection Transform", - "ShortName": null, - "Inputs": [ - { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": "String" + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + 
"Default": null + } + ] + } }, - "Desc": "Columns to use for feature selection", + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ "col" ], @@ -14133,18 +14086,6 @@ "SortOrder": 1.0, "IsNullable": false }, - { - "Name": "Count", - "Type": "Int", - "Desc": "If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved", - "Aliases": [ - "c" - ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false, - "Default": 1 - }, { "Name": "Data", "Type": "DataView", @@ -14152,6 +14093,18 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false + }, + { + "Name": "UseMarkerChars", + "Type": "Bool", + "Desc": "Whether to mark the beginning/end of each row/slot with start of text character (0x02)/end of text character (0x03)", + "Aliases": [ + "mark" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": true } ], "Outputs": [ @@ -14174,18 +14127,49 @@ ] }, { - "Name": "Transforms.FeatureSelectorByMutualInformation", - "Desc": "Selects the top k slots across all specified columns ordered by their mutual information with the label column.", - "FriendlyName": "Mutual Information Feature Selection Transform", - "ShortName": "MIFeatureSelection", + "Name": "Transforms.ColumnConcatenator", + "Desc": "Concatenates two columns of the same item type.", + "FriendlyName": "Concat Transform", + "ShortName": "Concat", "Inputs": [ { "Name": "Column", "Type": { "Kind": "Array", - "ItemType": "String" + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + 
} }, - "Desc": "Columns to use for feature selection", + "Desc": "New column definition(s) (optional form: name:srcs)", "Aliases": [ "col" ], @@ -14193,19 +14177,6 @@ "SortOrder": 1.0, "IsNullable": false }, - { - "Name": "SlotsInOutput", - "Type": "Int", - "Desc": "The maximum number of slots to preserve in output", - "Aliases": [ - "topk", - "numSlotsToKeep" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": 1000 - }, { "Name": "Data", "Type": "DataView", @@ -14213,30 +14184,6 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false - }, - { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Column to use for labels", - "Aliases": [ - "lab" - ], - "Required": false, - "SortOrder": 4.0, - "IsNullable": false, - "Default": "Label" - }, - { - "Name": "NumBins", - "Type": "Int", - "Desc": "Max number of bins for R4/R8 columns, power of 2 recommended", - "Aliases": [ - "bins" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 256 } ], "Outputs": [ @@ -14259,10 +14206,10 @@ ] }, { - "Name": "Transforms.GlobalContrastNormalizer", - "Desc": "Performs a global contrast normalization on input values: Y = (s * X - M) / D, where s is a scale, M is mean and D is either L2 norm or standard deviation.", - "FriendlyName": "Global Contrast Normalization Transform", - "ShortName": "Gcn", + "Name": "Transforms.ColumnCopier", + "Desc": "Duplicates columns from the dataset", + "FriendlyName": "Copy Columns Transform", + "ShortName": "Copy", "Inputs": [ { "Name": "Column", @@ -14271,33 +14218,6 @@ "ItemType": { "Kind": "Struct", "Fields": [ - { - "Name": "UseStdDev", - "Type": "Bool", - "Desc": "Normalize by standard deviation rather than L2 norm", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Scale", - "Type": "Float", - "Desc": "Scale features by this value", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - 
"Name": "SubMean", - "Type": "Bool", - "Desc": "Subtract mean from each value before normalizing", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, { "Name": "Name", "Type": "String", @@ -14329,19 +14249,9 @@ "Aliases": [ "col" ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "SubMean", - "Type": "Bool", - "Desc": "Subtract mean from each value before normalizing", - "Required": false, + "Required": true, "SortOrder": 1.0, - "IsNullable": false, - "Default": true + "IsNullable": false }, { "Name": "Data", @@ -14350,27 +14260,6 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false - }, - { - "Name": "UseStdDev", - "Type": "Bool", - "Desc": "Normalize by standard deviation rather than L2 norm", - "Aliases": [ - "useStd" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "Scale", - "Type": "Float", - "Desc": "Scale features by this value", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1.0 } ], "Outputs": [ @@ -14393,97 +14282,18 @@ ] }, { - "Name": "Transforms.HashConverter", - "Desc": "Converts column values into hashes. This transform accepts both numeric and text inputs, both single and vector-valued columns. This is a part of the Dracula transform.", - "FriendlyName": "Hash Join Transform", - "ShortName": "HashJoin", + "Name": "Transforms.ColumnDropper", + "Desc": "Drops columns from the dataset", + "FriendlyName": "Drop Columns Transform", + "ShortName": "Drop", "Inputs": [ { "Name": "Column", "Type": { "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Join", - "Type": "Bool", - "Desc": "Whether the values need to be combined for a single hash", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "CustomSlotMap", - "Type": "String", - "Desc": "Which slots should be combined together. 
Example: 0,3,5;0,1;3;2,1,0. Overrides 'join'.", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "HashBits", - "Type": "Int", - "Desc": "Number of bits to hash into. Must be between 1 and 31, inclusive.", - "Aliases": [ - "bits" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Seed", - "Type": "UInt", - "Desc": "Hashing seed", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Ordered", - "Type": "Bool", - "Desc": "Whether the position of each term should be included in the hash", - "Aliases": [ - "ord" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } + "ItemType": "String" }, - "Desc": "New column definition(s) (optional form: name:src)", + "Desc": "Column name to drop", "Aliases": [ "col" ], @@ -14498,48 +14308,6 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false - }, - { - "Name": "HashBits", - "Type": "Int", - "Desc": "Number of bits to hash into. 
Must be between 1 and 31, inclusive.", - "Aliases": [ - "bits" - ], - "Required": false, - "SortOrder": 2.0, - "IsNullable": false, - "Default": 31 - }, - { - "Name": "Join", - "Type": "Bool", - "Desc": "Whether the values need to be combined for a single hash", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "Seed", - "Type": "UInt", - "Desc": "Hashing seed", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 314489979 - }, - { - "Name": "Ordered", - "Type": "Bool", - "Desc": "Whether the position of each term should be included in the hash", - "Aliases": [ - "ord" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true } ], "Outputs": [ @@ -14562,87 +14330,26 @@ ] }, { - "Name": "Transforms.KeyToTextConverter", - "Desc": "KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the KeyValues metadata.", - "FriendlyName": "Key To Value Transform", + "Name": "Transforms.ColumnSelector", + "Desc": "Selects a set of columns, dropping all others", + "FriendlyName": "Select Columns", "ShortName": null, "Inputs": [ { "Name": "Column", "Type": { "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } + "ItemType": "String" }, - "Desc": "New column definition(s) (optional form: name:src)", + "Desc": "Column name to keep", "Aliases": [ "col" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, + "Required": 
false, "SortOrder": 1.0, - "IsNullable": false - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "IsNullable": false, + "Default": null }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.LabelColumnKeyBooleanConverter", - "Desc": "Transforms the label to either key or bool (if needed) to make it suitable for classification.", - "FriendlyName": "Prepare Classification Label", - "ShortName": null, - "Inputs": [ { "Name": "Data", "Type": "DataView", @@ -14650,23 +14357,6 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false - }, - { - "Name": "LabelColumn", - "Type": "String", - "Desc": "The label column", - "Required": true, - "SortOrder": 2.0, - "IsNullable": false - }, - { - "Name": "TextKeyValues", - "Type": "Bool", - "Desc": "Convert the key values to text", - "Required": false, - "SortOrder": 3.0, - "IsNullable": false, - "Default": true } ], "Outputs": [ @@ -14689,10 +14379,10 @@ ] }, { - "Name": "Transforms.LabelIndicator", - "Desc": "Label remapper used by OVA", - "FriendlyName": "LabelIndicator", - "ShortName": "LabelIndictator", + "Name": "Transforms.ColumnTypeConverter", + "Desc": "Converts a column to a different type, using standard conversions.", + "FriendlyName": "Convert Transform", + "ShortName": "Convert", "Inputs": [ { "Name": "Column", @@ -14702,17 +14392,57 @@ "Kind": "Struct", "Fields": [ { - "Name": "ClassIndex", - "Type": "Int", - "Desc": "The positive example class for binary classification.", + "Name": "ResultType", + "Type": { + "Kind": "Enum", + "Values": [ + "I1", + "U1", + "I2", + "U2", + "I4", + "U4", + "I8", + "U8", + "R4", + "Num", + "R8", + "TX", + "Text", + "TXT", + "BL", + "Bool", + "TimeSpan", + "TS", + "DT", + "DateTime", + "DZ", + "DateTimeZone", + "UG", + "U16" + ] + }, + "Desc": "The result type", "Aliases": [ - 
"index" + "type" ], "Required": false, "SortOrder": 150.0, "IsNullable": true, "Default": null }, + { + "Name": "Range", + "Type": "String", + "Desc": "For a key column, this defines the range of values", + "Aliases": [ + "key" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, { "Name": "Name", "Type": "String", @@ -14740,14 +14470,13 @@ ] } }, - "Desc": "New column definition(s) (optional form: name:src)", + "Desc": "New column definition(s) (optional form: name:type:src)", "Aliases": [ "col" ], - "Required": false, + "Required": true, "SortOrder": 1.0, - "IsNullable": false, - "Default": null + "IsNullable": false }, { "Name": "Data", @@ -14758,16 +14487,56 @@ "IsNullable": false }, { - "Name": "ClassIndex", - "Type": "Int", - "Desc": "Label of the positive class.", + "Name": "ResultType", + "Type": { + "Kind": "Enum", + "Values": [ + "I1", + "U1", + "I2", + "U2", + "I4", + "U4", + "I8", + "U8", + "R4", + "Num", + "R8", + "TX", + "Text", + "TXT", + "BL", + "Bool", + "TimeSpan", + "TS", + "DT", + "DateTime", + "DZ", + "DateTimeZone", + "UG", + "U16" + ] + }, + "Desc": "The result type", "Aliases": [ - "index" + "type" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Range", + "Type": "String", + "Desc": "For a key column, this defines the range of values", + "Aliases": [ + "key" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": null } ], "Outputs": [ @@ -14790,11 +14559,26 @@ ] }, { - "Name": "Transforms.LabelToFloatConverter", - "Desc": "Transforms the label to float to make it suitable for regression.", - "FriendlyName": "Prepare Regression Label", - "ShortName": null, + "Name": "Transforms.CombinerByContiguousGroupId", + "Desc": "Groups values of a scalar column into a vector, by a contiguous group ID", + "FriendlyName": "Group Transform", + "ShortName": "Group", "Inputs": [ + { + "Name": "GroupKey", + "Type": { + "Kind": 
"Array", + "ItemType": "String" + }, + "Desc": "Columns to group by", + "Aliases": [ + "g" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": null + }, { "Name": "Data", "Type": "DataView", @@ -14804,9 +14588,15 @@ "IsNullable": false }, { - "Name": "LabelColumn", - "Type": "String", - "Desc": "The label column", + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Columns to group together", + "Aliases": [ + "col" + ], "Required": true, "SortOrder": 2.0, "IsNullable": false @@ -14832,19 +14622,11 @@ ] }, { - "Name": "Transforms.LightLda", - "Desc": "The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation.", - "FriendlyName": "Latent Dirichlet Allocation Transform", - "ShortName": "LightLda", + "Name": "Transforms.ConditionalNormalizer", + "Desc": "Normalize the columns only if needed", + "FriendlyName": "Normalize If Needed", + "ShortName": null, "Inputs": [ - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, { "Name": "Column", "Type": { @@ -14853,95 +14635,11 @@ "Kind": "Struct", "Fields": [ { - "Name": "NumTopic", - "Type": "Int", - "Desc": "The number of topics in the LDA", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "AlphaSum", - "Type": "Float", - "Desc": "Dirichlet prior on document-topic vectors", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Beta", - "Type": "Float", - "Desc": "Dirichlet prior on vocab-topic vectors", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Mhstep", - "Type": "Int", - "Desc": "Number of Metropolis Hasting step", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "NumIterations", - "Type": "Int", - "Desc": "Number of 
iterations", - "Aliases": [ - "iter" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "LikelihoodInterval", - "Type": "Int", - "Desc": "Compute log likelihood over local dataset on this iteration interval", - "Aliases": [ - "llInterval" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "NumThreads", - "Type": "Int", - "Desc": "The number of training threads", - "Aliases": [ - "t" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "NumMaxDocToken", - "Type": "Int", - "Desc": "The threshold of maximum count of tokens per doc", - "Aliases": [ - "maxNumToken" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "NumSummaryTermPerTopic", - "Type": "Int", - "Desc": "The number of words to summarize the topic", + "Name": "FixZero", + "Type": "Bool", + "Desc": "Whether to map zero to zero, preserving sparsity", "Aliases": [ - "ns" + "zero" ], "Required": false, "SortOrder": 150.0, @@ -14949,23 +14647,11 @@ "Default": null }, { - "Name": "NumBurninIterations", + "Name": "MaxTrainingExamples", "Type": "Int", - "Desc": "The number of burn-in iterations", - "Aliases": [ - "burninIter" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": 10 - }, - { - "Name": "ResetRandomGenerator", - "Type": "Bool", - "Desc": "Reset the random number generator for each document", + "Desc": "Max number of examples used to train the normalizer", "Aliases": [ - "reset" + "maxtrain" ], "Required": false, "SortOrder": 150.0, @@ -14999,199 +14685,45 @@ ] } }, - "Desc": "New column definition(s) (optional form: name:srcs)", + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ "col" ], "Required": true, - "SortOrder": 49.0, + "SortOrder": 1.0, "IsNullable": false }, { - "Name": "NumTopic", - "Type": "Int", - "Desc": "The number 
of topics in the LDA", - "Required": false, - "SortOrder": 50.0, - "IsNullable": false, - "Default": 100, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 20, - 40, - 100, - 200 - ] - } + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "NumMaxDocToken", - "Type": "Int", - "Desc": "The threshold of maximum count of tokens per doc", + "Name": "FixZero", + "Type": "Bool", + "Desc": "Whether to map zero to zero, preserving sparsity", "Aliases": [ - "maxNumToken" + "zero" ], "Required": false, - "SortOrder": 50.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 512 + "Default": true }, { - "Name": "NumThreads", + "Name": "MaxTrainingExamples", "Type": "Int", - "Desc": "The number of training threads. Default value depends on number of logical processors.", - "Aliases": [ - "t" - ], - "Required": false, - "SortOrder": 50.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "AlphaSum", - "Type": "Float", - "Desc": "Dirichlet prior on document-topic vectors", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 100.0, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 1, - 10, - 100, - 200 - ] - } - }, - { - "Name": "Beta", - "Type": "Float", - "Desc": "Dirichlet prior on vocab-topic vectors", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 0.01, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 0.01, - 0.015, - 0.07, - 0.02 - ] - } - }, - { - "Name": "Mhstep", - "Type": "Int", - "Desc": "Number of Metropolis Hasting step", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 4, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 2, - 4, - 8, - 16 - ] - } - }, - { - "Name": "NumIterations", - "Type": "Int", - "Desc": "Number of iterations", - "Aliases": [ - "iter" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - 
"Default": 200, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 100, - 200, - 300, - 400 - ] - } - }, - { - "Name": "LikelihoodInterval", - "Type": "Int", - "Desc": "Compute log likelihood over local dataset on this iteration interval", - "Aliases": [ - "llInterval" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 5 - }, - { - "Name": "NumSummaryTermPerTopic", - "Type": "Int", - "Desc": "The number of words to summarize the topic", - "Aliases": [ - "ns" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 10 - }, - { - "Name": "NumBurninIterations", - "Type": "Int", - "Desc": "The number of burn-in iterations", - "Aliases": [ - "burninIter" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 10, - "SweepRange": { - "RangeType": "Discrete", - "Values": [ - 10, - 20, - 30, - 40 - ] - } - }, - { - "Name": "ResetRandomGenerator", - "Type": "Bool", - "Desc": "Reset the random number generator for each document", - "Aliases": [ - "reset" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": false - }, - { - "Name": "OutputTopicWordSummary", - "Type": "Bool", - "Desc": "Whether to output the topic-word summary in text format", + "Desc": "Max number of examples used to train the normalizer", "Aliases": [ - "summary" + "maxtrain" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": 1000000000 } ], "Outputs": [ @@ -15208,72 +14740,14 @@ ], "InputKind": [ "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" ] }, { - "Name": "Transforms.LogMeanVarianceNormalizer", - "Desc": "Normalizes the data based on the computed mean and variance of the logarithm of the data.", - "FriendlyName": "LogMeanVar Normalizer", - "ShortName": "LogMeanVar", + "Name": "Transforms.DataCache", + "Desc": "Caches using the specified cache option.", + "FriendlyName": "Cache Data", + "ShortName": null, "Inputs": [ - 
{ - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "MaxTrainingExamples", - "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", - "Aliases": [ - "maxtrain" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": "New column definition(s) (optional form: name:src)", - "Aliases": [ - "col" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": null - }, { "Name": "Data", "Type": "DataView", @@ -15283,204 +14757,95 @@ "IsNullable": false }, { - "Name": "UseCdf", - "Type": "Bool", - "Desc": "Whether to use CDF as the output", - "Aliases": [ - "cdf" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "MaxTrainingExamples", - "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", - "Aliases": [ - "maxtrain" - ], - "Required": false, - "SortOrder": 150.0, + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Memory", + "Disk" + ] + }, + "Desc": "Caching strategy", + "Required": true, + "SortOrder": 2.0, "IsNullable": false, - "Default": 1000000000 + "Default": "Memory" } ], "Outputs": [ { "Name": "OutputData", "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" + "Desc": "Dataset" } ], "InputKind": [ "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" ] }, { - "Name": "Transforms.LpNormalizer", - "Desc": "Normalize 
vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D is either L2 norm, L1 norm or LInf norm.", - "FriendlyName": "Lp-Norm Normalizer", - "ShortName": "lpnorm", + "Name": "Transforms.DatasetScorer", + "Desc": "Score a dataset with a predictor model", + "FriendlyName": null, + "ShortName": null, "Inputs": [ { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "NormKind", - "Type": { - "Kind": "Enum", - "Values": [ - "L2Norm", - "StdDev", - "L1Norm", - "LInf" - ] - }, - "Desc": "The norm to use to normalize each sample", - "Aliases": [ - "norm" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "SubMean", - "Type": "Bool", - "Desc": "Subtract mean from each value before normalizing", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": "New column definition(s) (optional form: name:src)", - "Aliases": [ - "col" - ], + "Name": "Data", + "Type": "DataView", + "Desc": "The dataset to be scored", "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "NormKind", - "Type": { - "Kind": "Enum", - "Values": [ - "L2Norm", - "StdDev", - "L1Norm", - "LInf" - ] - }, - "Desc": "The norm to use to normalize each sample", - "Aliases": [ - "norm" - ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": "L2Norm" - }, - { - "Name": "Data", - "Type": "DataView", - "Desc": 
"Input dataset", + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The predictor model to apply to data", "Required": true, - "SortOrder": 1.0, + "SortOrder": 2.0, "IsNullable": false }, { - "Name": "SubMean", - "Type": "Bool", - "Desc": "Subtract mean from each value before normalizing", + "Name": "Suffix", + "Type": "String", + "Desc": "Suffix to append to the score columns", "Required": false, - "SortOrder": 2.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": false + "Default": null } ], "Outputs": [ { - "Name": "OutputData", + "Name": "ScoredData", "Type": "DataView", - "Desc": "Transformed dataset" + "Desc": "The scored dataset" }, { - "Name": "Model", + "Name": "ScoringTransform", "Type": "TransformModel", - "Desc": "Transform model" + "Desc": "The scoring transform" } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" ] }, { - "Name": "Transforms.ManyHeterogeneousModelCombiner", - "Desc": "Combines a sequence of TransformModels and a PredictorModel into a single PredictorModel.", + "Name": "Transforms.DatasetTransformScorer", + "Desc": "Score a dataset with a transform model", "FriendlyName": null, "ShortName": null, "Inputs": [ { - "Name": "TransformModels", - "Type": { - "Kind": "Array", - "ItemType": "TransformModel" - }, - "Desc": "Transform model", + "Name": "Data", + "Type": "DataView", + "Desc": "The dataset to be scored", "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "Predictor model", + "Name": "TransformModel", + "Type": "TransformModel", + "Desc": "The transform model to apply to data", "Required": true, "SortOrder": 2.0, "IsNullable": false @@ -15488,17 +14853,22 @@ ], "Outputs": [ { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "Predictor model" + "Name": "ScoredData", + "Type": "DataView", + "Desc": "The scored dataset" + }, + { + "Name": "ScoringTransform", + "Type": "TransformModel", + 
"Desc": "The scoring transform" } ] }, { - "Name": "Transforms.MeanVarianceNormalizer", - "Desc": "Normalizes the data based on the computed mean and variance of the data.", - "FriendlyName": "MeanVar Normalizer", - "ShortName": "MeanVar", + "Name": "Transforms.Dictionarizer", + "Desc": "Converts input values (words, numbers, etc.) to index in a dictionary.", + "FriendlyName": "Term Transform", + "ShortName": "TermTransform", "Inputs": [ { "Name": "Column", @@ -15508,11 +14878,11 @@ "Kind": "Struct", "Fields": [ { - "Name": "FixZero", - "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", + "Name": "MaxNumTerms", + "Type": "Int", + "Desc": "Maximum number of terms to keep when auto-training", "Aliases": [ - "zero" + "max" ], "Required": false, "SortOrder": 150.0, @@ -15520,11 +14890,38 @@ "Default": null }, { - "Name": "MaxTrainingExamples", - "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", + "Name": "Term", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of terms", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Sort", + "Type": { + "Kind": "Enum", + "Values": [ + "Occurrence", + "Value" + ] + }, + "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "TextKeyValues", + "Type": "Bool", + "Desc": "Whether key value metadata should be text, regardless of the actual input type", "Aliases": [ - "maxtrain" + "textkv" ], "Required": false, "SortOrder": 150.0, @@ -15562,9 +14959,10 @@ "Aliases": [ "col" ], - "Required": true, + "Required": false, "SortOrder": 1.0, - "IsNullable": false + "IsNullable": false, + "Default": null }, { "Name": "Data", @@ -15575,40 +14973,55 @@ "IsNullable": false }, { - "Name": "UseCdf", - "Type": "Bool", - "Desc": "Whether to use CDF as the output", + "Name": "MaxNumTerms", + "Type": "Int", + "Desc": "Maximum number of terms to keep per column when auto-training", "Aliases": [ - "cdf" + "max" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 5.0, "IsNullable": false, - "Default": false + "Default": 1000000 }, { - "Name": "FixZero", - "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", - "Aliases": [ - "zero" - ], + "Name": "Term", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of terms", "Required": false, - "SortOrder": 150.0, + "SortOrder": 106.0, "IsNullable": false, - "Default": true + "Default": null }, { - "Name": "MaxTrainingExamples", - "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", - "Aliases": [ - "maxtrain" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1000000000 + "Name": "Sort", + "Type": { + "Kind": "Enum", + "Values": [ + "Occurrence", + "Value" + ] + }, + "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Required": false, + "SortOrder": 113.0, + "IsNullable": false, + "Default": "Occurrence" + }, + { + "Name": "TextKeyValues", + "Type": "Bool", + "Desc": "Whether key value metadata should be text, regardless of the actual input type", + "Aliases": [ + "textkv" + ], + "Required": false, + "SortOrder": 114.0, + "IsNullable": false, + "Default": false } ], "Outputs": [ @@ -15631,70 +15044,64 @@ ] }, { - "Name": "Transforms.MinMaxNormalizer", - "Desc": "Normalizes the data based on the observed minimum and maximum values of the data.", - "FriendlyName": "Min-Max Normalizer", - "ShortName": "MinMax", + "Name": "Transforms.FeatureCombiner", + "Desc": "Combines all the features into one feature column.", + "FriendlyName": "Feature Combiner", + "ShortName": "fc", + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Features", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Features", + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": null + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.FeatureSelectorByCount", + "Desc": "Selects the slots for which the count of non-default values is greater than or equal to a threshold.", + "FriendlyName": "Count Feature Selection Transform", + "ShortName": null, "Inputs": [ { "Name": "Column", "Type": { "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "FixZero", - "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", - 
"Aliases": [ - "zero" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "MaxTrainingExamples", - "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", - "Aliases": [ - "maxtrain" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } + "ItemType": "String" }, - "Desc": "New column definition(s) (optional form: name:src)", + "Desc": "Columns to use for feature selection", "Aliases": [ "col" ], @@ -15702,6 +15109,18 @@ "SortOrder": 1.0, "IsNullable": false }, + { + "Name": "Count", + "Type": "Int", + "Desc": "If the count of non-default values for a slot is greater than or equal to this threshold, the slot is preserved", + "Aliases": [ + "c" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 1 + }, { "Name": "Data", "Type": "DataView", @@ -15709,30 +15128,91 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" }, { - "Name": "FixZero", - "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.FeatureSelectorByMutualInformation", + "Desc": "Selects the top k slots across all specified columns ordered by their mutual information with the label column.", + "FriendlyName": "Mutual Information Feature 
Selection Transform", + "ShortName": "MIFeatureSelection", + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Columns to use for feature selection", "Aliases": [ - "zero" + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "SlotsInOutput", + "Type": "Int", + "Desc": "The maximum number of slots to preserve in output", + "Aliases": [ + "topk", + "numSlotsToKeep" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 1.0, "IsNullable": false, - "Default": true + "Default": 1000 }, { - "Name": "MaxTrainingExamples", + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", + "Aliases": [ + "lab" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": "Label" + }, + { + "Name": "NumBins", "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", + "Desc": "Max number of bins for R4/R8 columns, power of 2 recommended", "Aliases": [ - "maxtrain" + "bins" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1000000000 + "Default": 256 } ], "Outputs": [ @@ -15755,10 +15235,10 @@ ] }, { - "Name": "Transforms.MissingValueHandler", - "Desc": "Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). 
An indicator column can optionally be concatenated, if theinput column type is numeric.", - "FriendlyName": "NA Handle Transform", - "ShortName": "NAHandle", + "Name": "Transforms.GlobalContrastNormalizer", + "Desc": "Performs a global contrast normalization on input values: Y = (s * X - M) / D, where s is a scale, M is mean and D is either L2 norm or standard deviation.", + "FriendlyName": "Global Contrast Normalization Transform", + "ShortName": "Gcn", "Inputs": [ { "Name": "Column", @@ -15768,41 +15248,27 @@ "Kind": "Struct", "Fields": [ { - "Name": "Kind", - "Type": { - "Kind": "Enum", - "Values": [ - "DefaultValue", - "Mean", - "Minimum", - "Maximum" - ] - }, - "Desc": "The replacement method to utilize", + "Name": "UseStdDev", + "Type": "Bool", + "Desc": "Normalize by standard deviation rather than L2 norm", "Required": false, "SortOrder": 150.0, "IsNullable": true, "Default": null }, { - "Name": "ImputeBySlot", - "Type": "Bool", - "Desc": "Whether to impute values by slot", - "Aliases": [ - "slot" - ], + "Name": "Scale", + "Type": "Float", + "Desc": "Scale features by this value", "Required": false, "SortOrder": 150.0, "IsNullable": true, "Default": null }, { - "Name": "ConcatIndicator", + "Name": "SubMean", "Type": "Bool", - "Desc": "Whether or not to concatenate an indicator vector column to the value column", - "Aliases": [ - "ind" - ], + "Desc": "Subtract mean from each value before normalizing", "Required": false, "SortOrder": 150.0, "IsNullable": true, @@ -15835,13 +15301,23 @@ ] } }, - "Desc": "New column definition(s) (optional form: name:rep:src)", + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ "col" ], - "Required": true, + "Required": false, "SortOrder": 1.0, - "IsNullable": false + "IsNullable": false, + "Default": null + }, + { + "Name": "SubMean", + "Type": "Bool", + "Desc": "Subtract mean from each value before normalizing", + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": true }, { 
"Name": "Data", @@ -15852,48 +15328,25 @@ "IsNullable": false }, { - "Name": "ReplaceWith", - "Type": { - "Kind": "Enum", - "Values": [ - "DefaultValue", - "Mean", - "Minimum", - "Maximum" - ] - }, - "Desc": "The replacement method to utilize", - "Aliases": [ - "kind" - ], - "Required": false, - "SortOrder": 2.0, - "IsNullable": false, - "Default": "Def" - }, - { - "Name": "ImputeBySlot", + "Name": "UseStdDev", "Type": "Bool", - "Desc": "Whether to impute values by slot", + "Desc": "Normalize by standard deviation rather than L2 norm", "Aliases": [ - "slot" + "useStd" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": false }, { - "Name": "Concat", - "Type": "Bool", - "Desc": "Whether or not to concatenate an indicator vector column to the value column", - "Aliases": [ - "ind" - ], + "Name": "Scale", + "Type": "Float", + "Desc": "Scale features by this value", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": 1.0 } ], "Outputs": [ @@ -15916,10 +15369,10 @@ ] }, { - "Name": "Transforms.MissingValueIndicator", - "Desc": "Create a boolean output column with the same number of slots as the input column, where the output value is true if the value in the input column is missing.", - "FriendlyName": "NA Indicator Transform", - "ShortName": "NAInd", + "Name": "Transforms.HashConverter", + "Desc": "Converts column values into hashes. This transform accepts both numeric and text inputs, both single and vector-valued columns. 
This is a part of the Dracula transform.", + "FriendlyName": "Hash Join Transform", + "ShortName": "HashJoin", "Inputs": [ { "Name": "Column", @@ -15928,6 +15381,57 @@ "ItemType": { "Kind": "Struct", "Fields": [ + { + "Name": "Join", + "Type": "Bool", + "Desc": "Whether the values need to be combined for a single hash", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "CustomSlotMap", + "Type": "String", + "Desc": "Which slots should be combined together. Example: 0,3,5;0,1;3;2,1,0. Overrides 'join'.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "HashBits", + "Type": "Int", + "Desc": "Number of bits to hash into. Must be between 1 and 31, inclusive.", + "Aliases": [ + "bits" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Seed", + "Type": "UInt", + "Desc": "Hashing seed", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Ordered", + "Type": "Bool", + "Desc": "Whether the position of each term should be included in the hash", + "Aliases": [ + "ord" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, { "Name": "Name", "Type": "String", @@ -15970,6 +15474,48 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false + }, + { + "Name": "HashBits", + "Type": "Int", + "Desc": "Number of bits to hash into. 
Must be between 1 and 31, inclusive.", + "Aliases": [ + "bits" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": 31 + }, + { + "Name": "Join", + "Type": "Bool", + "Desc": "Whether the values need to be combined for a single hash", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "Seed", + "Type": "UInt", + "Desc": "Hashing seed", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 314489979 + }, + { + "Name": "Ordered", + "Type": "Bool", + "Desc": "Whether the position of each term should be included in the hash", + "Aliases": [ + "ord" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true } ], "Outputs": [ @@ -15992,10 +15538,10 @@ ] }, { - "Name": "Transforms.MissingValuesDropper", - "Desc": "Removes NAs from vector columns.", - "FriendlyName": "NA Drop Transform", - "ShortName": "NADrop", + "Name": "Transforms.KeyToTextConverter", + "Desc": "KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the KeyValues metadata.", + "FriendlyName": "Key To Value Transform", + "ShortName": null, "Inputs": [ { "Name": "Column", @@ -16031,7 +15577,7 @@ ] } }, - "Desc": "Columns to drop the NAs for", + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ "col" ], @@ -16068,41 +15614,35 @@ ] }, { - "Name": "Transforms.MissingValuesRowDropper", - "Desc": "Filters out rows that contain missing values.", - "FriendlyName": "NA Filter", - "ShortName": "NAFilter", + "Name": "Transforms.LabelColumnKeyBooleanConverter", + "Desc": "Transforms the label to either key or bool (if needed) to make it suitable for classification.", + "FriendlyName": "Prepare Classification Label", + "ShortName": null, "Inputs": [ { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Column", - "Aliases": [ - "col" - ], + "Name": "Data", + "Type": "DataView", + 
"Desc": "Input dataset", "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", + "Name": "LabelColumn", + "Type": "String", + "Desc": "The label column", "Required": true, - "SortOrder": 1.0, + "SortOrder": 2.0, "IsNullable": false }, { - "Name": "Complement", + "Name": "TextKeyValues", "Type": "Bool", - "Desc": "If true, keep only rows that contain NA values, and filter the rest.", + "Desc": "Convert the key values to text", "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": false + "Default": true } ], "Outputs": [ @@ -16125,10 +15665,10 @@ ] }, { - "Name": "Transforms.MissingValueSubstitutor", - "Desc": "Create an output column of the same type and size of the input column, where missing values are replaced with either the default value or the mean/min/max value (for non-text columns only).", - "FriendlyName": "NA Replace Transform", - "ShortName": "NARep", + "Name": "Transforms.LabelIndicator", + "Desc": "Label remapper used by OVA", + "FriendlyName": "LabelIndicator", + "ShortName": "LabelIndictator", "Inputs": [ { "Name": "Column", @@ -16138,41 +15678,14 @@ "Kind": "Struct", "Fields": [ { - "Name": "ReplacementString", - "Type": "String", - "Desc": "Replacement value for NAs (uses default value if not given)", + "Name": "ClassIndex", + "Type": "Int", + "Desc": "The positive example class for binary classification.", "Aliases": [ - "rep" + "index" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Kind", - "Type": { - "Kind": "Enum", - "Values": [ - "DefaultValue", - "Mean", - "Minimum", - "Maximum", - "SpecifiedValue" - ] - }, - "Desc": "The replacement method to utilize", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Slot", - "Type": "Bool", - "Desc": "Whether to impute values by slot", - "Required": false, - "SortOrder": 150.0, 
"IsNullable": true, "Default": null }, @@ -16203,13 +15716,14 @@ ] } }, - "Desc": "New column definition(s) (optional form: name:rep:src)", + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ "col" ], - "Required": true, + "Required": false, "SortOrder": 1.0, - "IsNullable": false + "IsNullable": false, + "Default": null }, { "Name": "Data", @@ -16220,37 +15734,16 @@ "IsNullable": false }, { - "Name": "ReplacementKind", - "Type": { - "Kind": "Enum", - "Values": [ - "DefaultValue", - "Mean", - "Minimum", - "Maximum", - "SpecifiedValue" - ] - }, - "Desc": "The replacement method to utilize", - "Aliases": [ - "kind" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "Default" - }, - { - "Name": "ImputeBySlot", - "Type": "Bool", - "Desc": "Whether to impute values by slot", + "Name": "ClassIndex", + "Type": "Int", + "Desc": "Label of the positive class.", "Aliases": [ - "slot" + "index" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": 0 } ], "Outputs": [ @@ -16273,38 +15766,61 @@ ] }, { - "Name": "Transforms.ModelCombiner", - "Desc": "Combines a sequence of TransformModels into a single model", - "FriendlyName": null, + "Name": "Transforms.LabelToFloatConverter", + "Desc": "Transforms the label to float to make it suitable for regression.", + "FriendlyName": "Prepare Regression Label", "ShortName": null, "Inputs": [ { - "Name": "Models", - "Type": { - "Kind": "Array", - "ItemType": "TransformModel" - }, - "Desc": "Input models", - "Required": false, + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, "SortOrder": 1.0, - "IsNullable": false, - "Default": null + "IsNullable": false + }, + { + "Name": "LabelColumn", + "Type": "String", + "Desc": "The label column", + "Required": true, + "SortOrder": 2.0, + "IsNullable": false } ], "Outputs": [ { - "Name": "OutputModel", + "Name": "OutputData", + "Type": "DataView", + "Desc": 
"Transformed dataset" + }, + { + "Name": "Model", "Type": "TransformModel", - "Desc": "Combined model" + "Desc": "Transform model" } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" ] }, { - "Name": "Transforms.NGramTranslator", - "Desc": "Produces a bag of counts of ngrams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.", - "FriendlyName": "NGram Transform", - "ShortName": "NgramTransform", + "Name": "Transforms.LightLda", + "Desc": "The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation.", + "FriendlyName": "Latent Dirichlet Allocation Transform", + "ShortName": "LightLda", "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, { "Name": "Column", "Type": { @@ -16313,11 +15829,47 @@ "Kind": "Struct", "Fields": [ { - "Name": "NgramLength", + "Name": "NumTopic", "Type": "Int", - "Desc": "Maximum ngram length", + "Desc": "The number of topics in the LDA", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "AlphaSum", + "Type": "Float", + "Desc": "Dirichlet prior on document-topic vectors", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Beta", + "Type": "Float", + "Desc": "Dirichlet prior on vocab-topic vectors", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Mhstep", + "Type": "Int", + "Desc": "Number of Metropolis Hasting step", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "NumIterations", + "Type": "Int", + "Desc": "Number of iterations", "Aliases": [ - "ngram" + "iter" ], "Required": false, "SortOrder": 150.0, @@ -16325,11 +15877,11 @@ 
"Default": null }, { - "Name": "AllLengths", - "Type": "Bool", - "Desc": "Whether to include all ngram lengths up to NgramLength or only NgramLength", + "Name": "LikelihoodInterval", + "Type": "Int", + "Desc": "Compute log likelihood over local dataset on this iteration interval", "Aliases": [ - "all" + "llInterval" ], "Required": false, "SortOrder": 150.0, @@ -16337,11 +15889,11 @@ "Default": null }, { - "Name": "SkipLength", + "Name": "NumThreads", "Type": "Int", - "Desc": "Maximum number of tokens to skip when constructing an ngram", + "Desc": "The number of training threads", "Aliases": [ - "skips" + "t" ], "Required": false, "SortOrder": 150.0, @@ -16349,31 +15901,48 @@ "Default": null }, { - "Name": "MaxNumTerms", - "Type": { - "Kind": "Array", - "ItemType": "Int" - }, - "Desc": "Maximum number of ngrams to store in the dictionary", + "Name": "NumMaxDocToken", + "Type": "Int", + "Desc": "The threshold of maximum count of tokens per doc", "Aliases": [ - "max" + "maxNumToken" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, + "IsNullable": true, "Default": null }, { - "Name": "Weighting", - "Type": { - "Kind": "Enum", - "Values": [ - "Tf", - "Idf", - "TfIdf" - ] - }, - "Desc": "Statistical measure used to evaluate how important a word is to a document in a corpus", + "Name": "NumSummaryTermPerTopic", + "Type": "Int", + "Desc": "The number of words to summarize the topic", + "Aliases": [ + "ns" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "NumBurninIterations", + "Type": "Int", + "Desc": "The number of burn-in iterations", + "Aliases": [ + "burninIter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": 10 + }, + { + "Name": "ResetRandomGenerator", + "Type": "Bool", + "Desc": "Reset the random number generator for each document", + "Aliases": [ + "reset" + ], "Required": false, "SortOrder": 150.0, "IsNullable": true, @@ -16406,173 +15975,199 @@ ] } }, - 
"Desc": "New column definition(s) (optional form: name:src)", + "Desc": "New column definition(s) (optional form: name:srcs)", "Aliases": [ "col" ], - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", "Required": true, - "SortOrder": 1.0, + "SortOrder": 49.0, "IsNullable": false }, { - "Name": "NgramLength", + "Name": "NumTopic", "Type": "Int", - "Desc": "Maximum ngram length", - "Aliases": [ - "ngram" - ], + "Desc": "The number of topics in the LDA", "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 2 + "Default": 100, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 20, + 40, + 100, + 200 + ] + } }, { - "Name": "AllLengths", - "Type": "Bool", - "Desc": "Whether to store all ngram lengths up to ngramLength, or only ngramLength", + "Name": "NumMaxDocToken", + "Type": "Int", + "Desc": "The threshold of maximum count of tokens per doc", "Aliases": [ - "all" + "maxNumToken" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": true + "Default": 512 }, { - "Name": "SkipLength", + "Name": "NumThreads", "Type": "Int", - "Desc": "Maximum number of tokens to skip when constructing an ngram", + "Desc": "The number of training threads. 
Default value depends on number of logical processors.", "Aliases": [ - "skips" + "t" ], "Required": false, + "SortOrder": 50.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "AlphaSum", + "Type": "Float", + "Desc": "Dirichlet prior on document-topic vectors", + "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 100.0, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 10, + 100, + 200 + ] + } }, { - "Name": "MaxNumTerms", - "Type": { - "Kind": "Array", - "ItemType": "Int" - }, - "Desc": "Maximum number of ngrams to store in the dictionary", - "Aliases": [ - "max" - ], + "Name": "Beta", + "Type": "Float", + "Desc": "Dirichlet prior on vocab-topic vectors", "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": [ - 10000000 - ] + "Default": 0.01, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.01, + 0.015, + 0.07, + 0.02 + ] + } }, { - "Name": "Weighting", - "Type": { - "Kind": "Enum", + "Name": "Mhstep", + "Type": "Int", + "Desc": "Number of Metropolis Hasting step", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 4, + "SweepRange": { + "RangeType": "Discrete", "Values": [ - "Tf", - "Idf", - "TfIdf" + 2, + 4, + 8, + 16 ] - }, - "Desc": "The weighting criteria", + } + }, + { + "Name": "NumIterations", + "Type": "Int", + "Desc": "Number of iterations", + "Aliases": [ + "iter" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "Tf" - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Default": 200, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 100, + 200, + 300, + 400 + ] + } }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.NoOperation", - "Desc": "Does nothing.", - "FriendlyName": "No Op", - 
"ShortName": "Nop", - "Inputs": [ - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - } - ], - "Outputs": [ + "Name": "LikelihoodInterval", + "Type": "Int", + "Desc": "Compute log likelihood over local dataset on this iteration interval", + "Aliases": [ + "llInterval" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 5 + }, { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Name": "NumSummaryTermPerTopic", + "Type": "Int", + "Desc": "The number of words to summarize the topic", + "Aliases": [ + "ns" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 10 }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.OptionalColumnCreator", - "Desc": "If the source column does not exist after deserialization, create a column with the right type and default values.", - "FriendlyName": "Optional Column Transform", - "ShortName": "optional", - "Inputs": [ + "Name": "NumBurninIterations", + "Type": "Int", + "Desc": "The number of burn-in iterations", + "Aliases": [ + "burninIter" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 10, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 10, + 20, + 30, + 40 + ] + } + }, { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "New column definition(s)", + "Name": "ResetRandomGenerator", + "Type": "Bool", + "Desc": "Reset the random number generator for each document", "Aliases": [ - "col" + "reset" ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false }, { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": 
true, - "SortOrder": 1.0, - "IsNullable": false + "Name": "OutputTopicWordSummary", + "Type": "Bool", + "Desc": "Whether to output the topic-word summary in text format", + "Aliases": [ + "summary" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false } ], "Outputs": [ @@ -16595,10 +16190,10 @@ ] }, { - "Name": "Transforms.PcaCalculator", - "Desc": "Train an PCA Anomaly model.", - "FriendlyName": "Principal Component Analysis Transform", - "ShortName": "Pca", + "Name": "Transforms.LogMeanVarianceNormalizer", + "Desc": "Normalizes the data based on the computed mean and variance of the logarithm of the data.", + "FriendlyName": "LogMeanVar Normalizer", + "ShortName": "LogMeanVar", "Inputs": [ { "Name": "Column", @@ -16608,87 +16203,39 @@ "Kind": "Struct", "Fields": [ { - "Name": "WeightColumn", - "Type": "String", - "Desc": "The name of the weight column", + "Name": "MaxTrainingExamples", + "Type": "Int", + "Desc": "Max number of examples used to train the normalizer", "Aliases": [ - "weight" + "maxtrain" ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, + "IsNullable": true, "Default": null }, { - "Name": "Rank", - "Type": "Int", - "Desc": "The number of components in the PCA", + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", "Aliases": [ - "k" + "name" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, + "IsNullable": false, "Default": null }, { - "Name": "Oversampling", - "Type": "Int", - "Desc": "Oversampling parameter for randomized PCA training", + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", "Aliases": [ - "over" + "src" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Center", - "Type": "Bool", - "Desc": "If enabled, data is centered to be zero mean", - "Aliases": [ - "center" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": 
"Seed", - "Type": "Int", - "Desc": "The seed for random number generation", - "Aliases": [ - "seed" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, + "IsNullable": false, "Default": null } ] @@ -16698,9 +16245,10 @@ "Aliases": [ "col" ], - "Required": true, + "Required": false, "SortOrder": 1.0, - "IsNullable": false + "IsNullable": false, + "Default": null }, { "Name": "Data", @@ -16711,100 +16259,28 @@ "IsNullable": false }, { - "Name": "WeightColumn", - "Type": "String", - "Desc": "The name of the weight column", - "Aliases": [ - "weight" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Rank", - "Type": "Int", - "Desc": "The number of components in the PCA", + "Name": "UseCdf", + "Type": "Bool", + "Desc": "Whether to use CDF as the output", "Aliases": [ - "k" + "cdf" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 20 + "Default": true }, { - "Name": "Oversampling", + "Name": "MaxTrainingExamples", "Type": "Int", - "Desc": "Oversampling parameter for randomized PCA training", + "Desc": "Max number of examples used to train the normalizer", "Aliases": [ - "over" + "maxtrain" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 20 - }, - { - "Name": "Center", - "Type": "Bool", - "Desc": "If enabled, data is centered to be zero mean", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": true - }, - { - "Name": "Seed", - "Type": "Int", - "Desc": "The seed for random number generation", - "Required": false, - "SortOrder": 
150.0, - "IsNullable": false, - "Default": 0 - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.PredictedLabelColumnOriginalValueConverter", - "Desc": "Transforms a predicted label column to its original values, unless it is of type bool.", - "FriendlyName": "Convert Predicted Label", - "ShortName": null, - "Inputs": [ - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "PredictedLabelColumn", - "Type": "String", - "Desc": "The predicted label column", - "Required": true, - "SortOrder": 2.0, - "IsNullable": false + "Default": 1000000000 } ], "Outputs": [ @@ -16827,10 +16303,10 @@ ] }, { - "Name": "Transforms.RandomNumberGenerator", - "Desc": "Adds a column with a generated number sequence.", - "FriendlyName": "Generate Number Transform", - "ShortName": "Generate", + "Name": "Transforms.LpNormalizer", + "Desc": "Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). 
Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D is either L2 norm, L1 norm or LInf norm.", + "FriendlyName": "Lp-Norm Normalizer", + "ShortName": "lpnorm", "Inputs": [ { "Name": "Column", @@ -16840,42 +16316,62 @@ "Kind": "Struct", "Fields": [ { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", + "Name": "NormKind", + "Type": { + "Kind": "Enum", + "Values": [ + "L2Norm", + "StdDev", + "L1Norm", + "LInf" + ] + }, + "Desc": "The norm to use to normalize each sample", "Aliases": [ - "name" + "norm" ], "Required": false, - "SortOrder": 150.0, - "IsNullable": false, + "SortOrder": 1.0, + "IsNullable": true, "Default": null }, { - "Name": "UseCounter", + "Name": "SubMean", "Type": "Bool", - "Desc": "Use an auto-incremented integer starting at zero instead of a random number", + "Desc": "Subtract mean from each value before normalizing", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", "Aliases": [ - "cnt" + "name" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, + "IsNullable": false, "Default": null }, { - "Name": "Seed", - "Type": "UInt", - "Desc": "The random seed", + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, + "IsNullable": false, "Default": null } ] } }, - "Desc": "New column definition(s) (optional form: name:seed)", + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ "col" ], @@ -16883,6 +16379,26 @@ "SortOrder": 1.0, "IsNullable": false }, + { + "Name": "NormKind", + "Type": { + "Kind": "Enum", + "Values": [ + "L2Norm", + "StdDev", + "L1Norm", + "LInf" + ] + }, + "Desc": "The norm to use to normalize each sample", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 
"L2Norm" + }, { "Name": "Data", "Type": "DataView", @@ -16892,25 +16408,13 @@ "IsNullable": false }, { - "Name": "UseCounter", + "Name": "SubMean", "Type": "Bool", - "Desc": "Use an auto-incremented integer starting at zero instead of a random number", - "Aliases": [ - "cnt" - ], + "Desc": "Subtract mean from each value before normalizing", "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, "Default": false - }, - { - "Name": "Seed", - "Type": "UInt", - "Desc": "The random seed", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 42 } ], "Outputs": [ @@ -16933,74 +16437,154 @@ ] }, { - "Name": "Transforms.RowRangeFilter", - "Desc": "Filters a dataview on a column of type Single, Double or Key (contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. If the input is a Key type, the min/max are considered percentages of the number of values.", - "FriendlyName": "Range Filter", - "ShortName": "RangeFilter", + "Name": "Transforms.ManyHeterogeneousModelCombiner", + "Desc": "Combines a sequence of TransformModels and a PredictorModel into a single PredictorModel.", + "FriendlyName": null, + "ShortName": null, "Inputs": [ { - "Name": "Column", - "Type": "String", - "Desc": "Column", - "Aliases": [ - "col" - ], - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", + "Name": "TransformModels", + "Type": { + "Kind": "Array", + "ItemType": "TransformModel" + }, + "Desc": "Transform model", "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "Min", - "Type": "Float", - "Desc": "Minimum value (0 to 1 for key types)", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "Predictor model", + "Required": true, + "SortOrder": 2.0, + "IsNullable": false + } + ], + "Outputs": [ + { + 
"Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "Predictor model" + } + ] + }, + { + "Name": "Transforms.MeanVarianceNormalizer", + "Desc": "Normalizes the data based on the computed mean and variance of the data.", + "FriendlyName": "MeanVar Normalizer", + "ShortName": "MeanVar", + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "FixZero", + "Type": "Bool", + "Desc": "Whether to map zero to zero, preserving sparsity", + "Aliases": [ + "zero" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "MaxTrainingExamples", + "Type": "Int", + "Desc": "Max number of examples used to train the normalizer", + "Aliases": [ + "maxtrain" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "Max", - "Type": "Float", - "Desc": "Maximum value (0 to 1 for key types)", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "Complement", + "Name": "UseCdf", "Type": "Bool", - "Desc": "If true, keep the values that fall outside the range.", + "Desc": "Whether to use CDF as the output", + "Aliases": [ + "cdf" + ], "Required": false, "SortOrder": 150.0, 
"IsNullable": false, "Default": false }, { - "Name": "IncludeMin", + "Name": "FixZero", "Type": "Bool", - "Desc": "If true, include in the range the values that are equal to min.", + "Desc": "Whether to map zero to zero, preserving sparsity", + "Aliases": [ + "zero" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, "Default": true }, { - "Name": "IncludeMax", - "Type": "Bool", - "Desc": "If true, include in the range the values that are equal to max.", + "Name": "MaxTrainingExamples", + "Type": "Int", + "Desc": "Max number of examples used to train the normalizer", + "Aliases": [ + "maxtrain" + ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null + "IsNullable": false, + "Default": 1000000000 } ], "Outputs": [ @@ -17023,22 +16607,76 @@ ] }, { - "Name": "Transforms.RowSkipAndTakeFilter", - "Desc": "Allows limiting input to a subset of rows at an optional offset. Can be used to implement data paging.", - "FriendlyName": "Skip and Take Filter", - "ShortName": "SkipTake", + "Name": "Transforms.MinMaxNormalizer", + "Desc": "Normalizes the data based on the observed minimum and maximum values of the data.", + "FriendlyName": "Min-Max Normalizer", + "ShortName": "MinMax", "Inputs": [ { - "Name": "Skip", - "Type": "Int", - "Desc": "Number of items to skip", + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "FixZero", + "Type": "Bool", + "Desc": "Whether to map zero to zero, preserving sparsity", + "Aliases": [ + "zero" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "MaxTrainingExamples", + "Type": "Int", + "Desc": "Max number of examples used to train the normalizer", + "Aliases": [ + "maxtrain" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": 
false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ - "s" + "col" ], - "Required": false, + "Required": true, "SortOrder": 1.0, - "IsNullable": true, - "Default": null + "IsNullable": false }, { "Name": "Data", @@ -17049,16 +16687,28 @@ "IsNullable": false }, { - "Name": "Take", + "Name": "FixZero", + "Type": "Bool", + "Desc": "Whether to map zero to zero, preserving sparsity", + "Aliases": [ + "zero" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "MaxTrainingExamples", "Type": "Int", - "Desc": "Number of items to take", + "Desc": "Max number of examples used to train the normalizer", "Aliases": [ - "t" + "maxtrain" ], "Required": false, - "SortOrder": 2.0, - "IsNullable": true, - "Default": null + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000000000 } ], "Outputs": [ @@ -17081,24 +16731,93 @@ ] }, { - "Name": "Transforms.RowSkipFilter", - "Desc": "Allows limiting input to a subset of rows by skipping a number of rows.", - "FriendlyName": "Skip Filter", - "ShortName": "Skip", + "Name": "Transforms.MissingValueHandler", + "Desc": "Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). 
An indicator column can optionally be concatenated, if theinput column type is numeric.", + "FriendlyName": "NA Handle Transform", + "ShortName": "NAHandle", "Inputs": [ { - "Name": "Count", - "Type": "Int", - "Desc": "Number of items to skip", - "Aliases": [ - "c", - "n", - "s" + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Kind", + "Type": { + "Kind": "Enum", + "Values": [ + "DefaultValue", + "Mean", + "Minimum", + "Maximum" + ] + }, + "Desc": "The replacement method to utilize", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "ImputeBySlot", + "Type": "Bool", + "Desc": "Whether to impute values by slot", + "Aliases": [ + "slot" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "ConcatIndicator", + "Type": "Bool", + "Desc": "Whether or not to concatenate an indicator vector column to the value column", + "Aliases": [ + "ind" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:rep:src)", + "Aliases": [ + "col" ], "Required": true, "SortOrder": 1.0, - "IsNullable": false, - "Default": 0 + "IsNullable": false }, { "Name": "Data", @@ -17107,6 +16826,50 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false + }, + { + "Name": "ReplaceWith", + "Type": { + "Kind": "Enum", + "Values": [ + "DefaultValue", + "Mean", + "Minimum", + "Maximum" + ] + }, + "Desc": "The replacement method to utilize", 
+ "Aliases": [ + "kind" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Def" + }, + { + "Name": "ImputeBySlot", + "Type": "Bool", + "Desc": "Whether to impute values by slot", + "Aliases": [ + "slot" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "Concat", + "Type": "Bool", + "Desc": "Whether or not to concatenate an indicator vector column to the value column", + "Aliases": [ + "ind" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true } ], "Outputs": [ @@ -17129,24 +16892,52 @@ ] }, { - "Name": "Transforms.RowTakeFilter", - "Desc": "Allows limiting input to a subset of rows by taking N first rows.", - "FriendlyName": "Take Filter", - "ShortName": "Take", + "Name": "Transforms.MissingValueIndicator", + "Desc": "Create a boolean output column with the same number of slots as the input column, where the output value is true if the value in the input column is missing.", + "FriendlyName": "NA Indicator Transform", + "ShortName": "NAInd", "Inputs": [ { - "Name": "Count", - "Type": "Int", - "Desc": "Number of items to take", + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ - "c", - "n", - "t" + "col" ], "Required": true, "SortOrder": 1.0, - "IsNullable": false, - "Default": 9223372036854775807 + "IsNullable": false }, { "Name": "Data", @@ -17177,11 +16968,53 @@ ] }, { - "Name": "Transforms.ScoreColumnSelector", - 
"Desc": "Selects only the last score columns and the extra columns specified in the arguments.", - "FriendlyName": "Choose Columns By Index", - "ShortName": null, + "Name": "Transforms.MissingValuesDropper", + "Desc": "Removes NAs from vector columns.", + "FriendlyName": "NA Drop Transform", + "ShortName": "NADrop", "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "Columns to drop the NAs for", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, { "Name": "Data", "Type": "DataView", @@ -17189,18 +17022,6 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false - }, - { - "Name": "ExtraColumns", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Extra columns to write", - "Required": false, - "SortOrder": 2.0, - "IsNullable": false, - "Default": null } ], "Outputs": [ @@ -17223,107 +17044,18 @@ ] }, { - "Name": "Transforms.Scorer", - "Desc": "Turn the predictor model into a transform model", - "FriendlyName": null, - "ShortName": null, + "Name": "Transforms.MissingValuesRowDropper", + "Desc": "Filters out rows that contain missing values.", + "FriendlyName": "NA Filter", + "ShortName": "NAFilter", "Inputs": [ - { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "The predictor model to turn into a transform", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - } - ], - "Outputs": [ - { - "Name": "ScoredData", - "Type": "DataView", - "Desc": "The scored dataset" - }, - { - "Name": "ScoringTransform", 
- "Type": "TransformModel", - "Desc": "The scoring transform" - } - ] - }, - { - "Name": "Transforms.Segregator", - "Desc": "Un-groups vector columns into sequences of rows, inverse of Group transform", - "FriendlyName": "Un-group Transform", - "ShortName": "Ungroup", - "Inputs": [ - { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, { "Name": "Column", "Type": { "Kind": "Array", "ItemType": "String" }, - "Desc": "Columns to unroll, or 'pivot'", - "Aliases": [ - "col" - ], - "Required": true, - "SortOrder": 150.0, - "IsNullable": false - }, - { - "Name": "Mode", - "Type": { - "Kind": "Enum", - "Values": [ - "Inner", - "Outer", - "First" - ] - }, - "Desc": "Specifies how to unroll multiple pivot columns of different size.", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "Inner" - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" - }, - { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" - } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" - ] - }, - { - "Name": "Transforms.SentimentAnalyzer", - "Desc": "Uses a pretrained sentiment model to score input strings", - "FriendlyName": "Sentiment Analyzing Transform", - "ShortName": "Senti", - "Inputs": [ - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column.", + "Desc": "Column", "Aliases": [ "col" ], @@ -17340,16 +17072,13 @@ "IsNullable": false }, { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column.", - "Aliases": [ - "dst" - ], + "Name": "Complement", + "Type": "Bool", + "Desc": "If true, keep only rows that contain NA values, and filter the rest.", "Required": false, - "SortOrder": 2.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": false } ], "Outputs": [ @@ -17372,10 +17101,10 @@ ] }, { - "Name": 
"Transforms.SupervisedBinNormalizer", - "Desc": "Similar to BinNormalizer, but calculates bins based on correlation with the label column, not equi-density. The new value is bin_number / number_of_bins.", - "FriendlyName": "Supervised Binning Normalizer", - "ShortName": "SupBin", + "Name": "Transforms.MissingValueSubstitutor", + "Desc": "Create an output column of the same type and size of the input column, where missing values are replaced with either the default value or the mean/min/max value (for non-text columns only).", + "FriendlyName": "NA Replace Transform", + "ShortName": "NARep", "Inputs": [ { "Name": "Column", @@ -17385,36 +17114,39 @@ "Kind": "Struct", "Fields": [ { - "Name": "NumBins", - "Type": "Int", - "Desc": "Max number of bins, power of 2 recommended", + "Name": "ReplacementString", + "Type": "String", + "Desc": "Replacement value for NAs (uses default value if not given)", "Aliases": [ - "bins" + "rep" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, + "IsNullable": false, "Default": null }, { - "Name": "FixZero", - "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", - "Aliases": [ - "zero" - ], + "Name": "Kind", + "Type": { + "Kind": "Enum", + "Values": [ + "DefaultValue", + "Mean", + "Minimum", + "Maximum", + "SpecifiedValue" + ] + }, + "Desc": "The replacement method to utilize", "Required": false, "SortOrder": 150.0, "IsNullable": true, "Default": null }, { - "Name": "MaxTrainingExamples", - "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", - "Aliases": [ - "maxtrain" - ], + "Name": "Slot", + "Type": "Bool", + "Desc": "Whether to impute values by slot", "Required": false, "SortOrder": 150.0, "IsNullable": true, @@ -17447,14 +17179,13 @@ ] } }, - "Desc": "New column definition(s) (optional form: name:src)", + "Desc": "New column definition(s) (optional form: name:rep:src)", "Aliases": [ "col" ], - "Required": false, + "Required": true, "SortOrder": 1.0, - "IsNullable": 
false, - "Default": null + "IsNullable": false }, { "Name": "Data", @@ -17465,61 +17196,37 @@ "IsNullable": false }, { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Label column for supervised binning", - "Aliases": [ - "label", - "lab" - ], - "Required": true, - "SortOrder": 150.0, - "IsNullable": false - }, - { - "Name": "MinBinSize", - "Type": "Int", - "Desc": "Minimum number of examples per bin", - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 10 - }, - { - "Name": "NumBins", - "Type": "Int", - "Desc": "Max number of bins, power of 2 recommended", + "Name": "ReplacementKind", + "Type": { + "Kind": "Enum", + "Values": [ + "DefaultValue", + "Mean", + "Minimum", + "Maximum", + "SpecifiedValue" + ] + }, + "Desc": "The replacement method to utilize", "Aliases": [ - "bins" + "kind" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1024 + "Default": "Default" }, { - "Name": "FixZero", + "Name": "ImputeBySlot", "Type": "Bool", - "Desc": "Whether to map zero to zero, preserving sparsity", + "Desc": "Whether to impute values by slot", "Aliases": [ - "zero" + "slot" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, "Default": true - }, - { - "Name": "MaxTrainingExamples", - "Type": "Int", - "Desc": "Max number of examples used to train the normalizer", - "Aliases": [ - "maxtrain" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": 1000000000 } ], "Outputs": [ @@ -17542,288 +17249,306 @@ ] }, { - "Name": "Transforms.TextFeaturizer", - "Desc": "A transform that turns a collection of text documents into numerical feature vectors. 
The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text.", - "FriendlyName": "Text Transform", - "ShortName": "Text", + "Name": "Transforms.ModelCombiner", + "Desc": "Combines a sequence of TransformModels into a single model", + "FriendlyName": null, + "ShortName": null, "Inputs": [ { - "Name": "Column", + "Name": "Models", "Type": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] + "Kind": "Array", + "ItemType": "TransformModel" }, - "Desc": "New column definition (optional form: name:srcs).", - "Aliases": [ - "col" - ], - "Required": true, + "Desc": "Input models", + "Required": false, "SortOrder": 1.0, - "IsNullable": false - }, + "IsNullable": false, + "Default": null + } + ], + "Outputs": [ { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false - }, + "Name": "OutputModel", + "Type": "TransformModel", + "Desc": "Combined model" + } + ] + }, + { + "Name": "Transforms.NGramTranslator", + "Desc": "Produces a bag of counts of ngrams (sequences of consecutive values of length 1-n) in a given vector of keys. 
It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag.", + "FriendlyName": "NGram Transform", + "ShortName": "NgramTransform", + "Inputs": [ { - "Name": "Language", + "Name": "Column", "Type": { - "Kind": "Enum", - "Values": [ - "English", - "French", - "German", - "Dutch", - "Italian", - "Spanish", - "Japanese" - ] + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "NgramLength", + "Type": "Int", + "Desc": "Maximum ngram length", + "Aliases": [ + "ngram" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "AllLengths", + "Type": "Bool", + "Desc": "Whether to include all ngram lengths up to NgramLength or only NgramLength", + "Aliases": [ + "all" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "SkipLength", + "Type": "Int", + "Desc": "Maximum number of tokens to skip when constructing an ngram", + "Aliases": [ + "skips" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "MaxNumTerms", + "Type": { + "Kind": "Array", + "ItemType": "Int" + }, + "Desc": "Maximum number of ngrams to store in the dictionary", + "Aliases": [ + "max" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Weighting", + "Type": { + "Kind": "Enum", + "Values": [ + "Tf", + "Idf", + "TfIdf" + ] + }, + "Desc": "Statistical measure used to evaluate how important a word is to a document in a corpus", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + 
"Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } }, - "Desc": "Dataset language or 'AutoDetect' to detect language per row.", + "Desc": "New column definition(s) (optional form: name:src)", "Aliases": [ - "lang" + "col" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 1.0, "IsNullable": false, - "Default": "English" + "Default": null }, { - "Name": "StopWordsRemover", - "Type": { - "Kind": "Component", - "ComponentKind": "StopWordsRemover" - }, - "Desc": "Stopwords remover.", - "Aliases": [ - "remover" - ], - "Required": false, - "SortOrder": 4.0, - "IsNullable": false, - "Default": null + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "TextCase", - "Type": { - "Kind": "Enum", - "Values": [ - "Lower", - "Upper", - "None" - ] - }, - "Desc": "Casing text using the rules of the invariant culture.", + "Name": "NgramLength", + "Type": "Int", + "Desc": "Maximum ngram length", "Aliases": [ - "case" + "ngram" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Lower" + "Default": 2 }, { - "Name": "KeepDiacritics", + "Name": "AllLengths", "Type": "Bool", - "Desc": "Whether to keep diacritical marks or remove them.", + "Desc": "Whether to store all ngram lengths up to ngramLength, or only ngramLength", "Aliases": [ - "diac" + "all" ], "Required": false, - "SortOrder": 6.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": true }, { - "Name": "KeepPunctuations", - "Type": "Bool", - "Desc": "Whether to keep punctuation marks or remove them.", + "Name": "SkipLength", + "Type": "Int", + "Desc": "Maximum number of tokens to skip when constructing an ngram", "Aliases": [ - "punc" + "skips" ], "Required": false, - "SortOrder": 7.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": 0 }, { - "Name": "KeepNumbers", - "Type": "Bool", - "Desc": 
"Whether to keep numbers or remove them.", + "Name": "MaxNumTerms", + "Type": { + "Kind": "Array", + "ItemType": "Int" + }, + "Desc": "Maximum number of ngrams to store in the dictionary", "Aliases": [ - "num" + "max" ], "Required": false, - "SortOrder": 8.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": true + "Default": [ + 10000000 + ] }, { - "Name": "OutputTokens", - "Type": "Bool", - "Desc": "Whether to output the transformed text tokens as an additional column.", - "Aliases": [ - "tokens", - "showtext", - "showTransformedText" - ], + "Name": "Weighting", + "Type": { + "Kind": "Enum", + "Values": [ + "Tf", + "Idf", + "TfIdf" + ] + }, + "Desc": "The weighting criteria", "Required": false, - "SortOrder": 9.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": "Tf" + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" }, { - "Name": "Dictionary", - "Type": { - "Kind": "Struct", - "Fields": [ - { - "Name": "Term", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "List of terms", - "Required": false, - "SortOrder": 1.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Sort", - "Type": { - "Kind": "Enum", - "Values": [ - "Occurrence", - "Value" - ] - }, - "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", - "Required": false, - "SortOrder": 5.0, - "IsNullable": false, - "Default": "Occurrence" - }, - { - "Name": "DropUnknowns", - "Type": "Bool", - "Desc": "Drop unknown terms instead of mapping them to NA term.", - "Aliases": [ - "dropna" - ], - "Required": false, - "SortOrder": 6.0, - "IsNullable": false, - "Default": false - } - ] - }, - "Desc": "A dictionary of whitelisted terms.", - "Aliases": [ - "dict" - ], - "Required": false, - "SortOrder": 10.0, - "IsNullable": false, - "Default": null - }, + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.NoOperation", + "Desc": "Does nothing.", + "FriendlyName": "No Op", + "ShortName": "Nop", + "Inputs": [ { - "Name": "WordFeatureExtractor", - "Type": { - "Kind": "Component", - "ComponentKind": "NgramExtractor" - }, - "Desc": "Ngram feature extractor to use for words (WordBag/WordHashBag).", - "Aliases": [ - "wordExtractor" - ], - "Required": false, - "SortOrder": 11.0, - "IsNullable": false, - "Default": { - "Name": "NGram", - "Settings": { - "MaxNumTerms": [ - 10000000 - ] - } - } + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" }, { - "Name": "CharFeatureExtractor", + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.OptionalColumnCreator", + "Desc": "If the source column does not exist after deserialization, create a column with the right type and default values.", + "FriendlyName": "Optional Column 
Transform", + "ShortName": "optional", + "Inputs": [ + { + "Name": "Column", "Type": { - "Kind": "Component", - "ComponentKind": "NgramExtractor" + "Kind": "Array", + "ItemType": "String" }, - "Desc": "Ngram feature extractor to use for characters (WordBag/WordHashBag).", + "Desc": "New column definition(s)", "Aliases": [ - "charExtractor" + "col" ], - "Required": false, - "SortOrder": 12.0, - "IsNullable": false, - "Default": { - "Name": "NGram", - "Settings": { - "NgramLength": 3, - "AllLengths": false, - "MaxNumTerms": [ - 10000000 - ] - } - } + "Required": true, + "SortOrder": 1.0, + "IsNullable": false }, { - "Name": "VectorNormalizer", - "Type": { - "Kind": "Enum", - "Values": [ - "None", - "L1", - "L2", - "LInf" - ] - }, - "Desc": "Normalize vectors (rows) individually by rescaling them to unit norm.", - "Aliases": [ - "norm" - ], - "Required": false, - "SortOrder": 13.0, - "IsNullable": false, - "Default": "L2" + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false } ], "Outputs": [ @@ -17846,10 +17571,10 @@ ] }, { - "Name": "Transforms.TextToKeyConverter", - "Desc": "Converts input values (words, numbers, etc.) 
to index in a dictionary.", - "FriendlyName": "Term Transform", - "ShortName": null, + "Name": "Transforms.PcaCalculator", + "Desc": "Train an PCA Anomaly model.", + "FriendlyName": "Principal Component Analysis Transform", + "ShortName": "Pca", "Inputs": [ { "Name": "Column", @@ -17859,11 +17584,23 @@ "Kind": "Struct", "Fields": [ { - "Name": "MaxNumTerms", + "Name": "WeightColumn", + "Type": "String", + "Desc": "The name of the weight column", + "Aliases": [ + "weight" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Rank", "Type": "Int", - "Desc": "Maximum number of terms to keep when auto-training", + "Desc": "The number of components in the PCA", "Aliases": [ - "max" + "k" ], "Required": false, "SortOrder": 150.0, @@ -17871,38 +17608,35 @@ "Default": null }, { - "Name": "Term", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "List of terms", + "Name": "Oversampling", + "Type": "Int", + "Desc": "Oversampling parameter for randomized PCA training", + "Aliases": [ + "over" + ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, + "IsNullable": true, "Default": null }, { - "Name": "Sort", - "Type": { - "Kind": "Enum", - "Values": [ - "Occurrence", - "Value" - ] - }, - "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Name": "Center", + "Type": "Bool", + "Desc": "If enabled, data is centered to be zero mean", + "Aliases": [ + "center" + ], "Required": false, "SortOrder": 150.0, "IsNullable": true, "Default": null }, { - "Name": "TextKeyValues", - "Type": "Bool", - "Desc": "Whether key value metadata should be text, regardless of the actual input type", + "Name": "Seed", + "Type": "Int", + "Desc": "The seed for random number generation", "Aliases": [ - "textkv" + "seed" ], "Required": false, "SortOrder": 150.0, @@ -17940,10 +17674,9 @@ "Aliases": [ "col" ], - "Required": false, + "Required": true, "SortOrder": 1.0, - "IsNullable": false, - "Default": null + "IsNullable": false }, { "Name": "Data", @@ -17954,55 +17687,58 @@ "IsNullable": false }, { - "Name": "MaxNumTerms", - "Type": "Int", - "Desc": "Maximum number of terms to keep per column when auto-training", + "Name": "WeightColumn", + "Type": "String", + "Desc": "The name of the weight column", "Aliases": [ - "max" + "weight" ], "Required": false, - "SortOrder": 5.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 1000000 + "Default": null }, { - "Name": "Term", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "List of terms", - "Required": false, - "SortOrder": 106.0, - "IsNullable": false, - "Default": null + "Name": "Rank", + "Type": "Int", + "Desc": "The number of components in the PCA", + "Aliases": [ + "k" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 20 }, { - "Name": "Sort", - "Type": { - "Kind": "Enum", - "Values": [ - "Occurrence", - "Value" - ] - }, - "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Name": "Oversampling", + "Type": "Int", + "Desc": "Oversampling parameter for randomized PCA training", + "Aliases": [ + "over" + ], "Required": false, - "SortOrder": 113.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": "Occurrence" + "Default": 20 }, { - "Name": "TextKeyValues", + "Name": "Center", "Type": "Bool", - "Desc": "Whether key value metadata should be text, regardless of the actual input type", - "Aliases": [ - "textkv" - ], + "Desc": "If enabled, data is centered to be zero mean", "Required": false, - "SortOrder": 114.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": false + "Default": true + }, + { + "Name": "Seed", + "Type": "Int", + "Desc": "The seed for random number generation", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0 } ], "Outputs": [ @@ -18025,9 +17761,9 @@ ] }, { - "Name": "Transforms.TrainTestDatasetSplitter", - "Desc": "Split the dataset into train and test sets", - "FriendlyName": "Dataset Train-Test Split", + "Name": "Transforms.PredictedLabelColumnOriginalValueConverter", + "Desc": "Transforms a predicted label column to its original values, unless it is of type bool.", + "FriendlyName": "Convert Predicted Label", "ShortName": null, "Inputs": [ { @@ -18039,85 +17775,118 @@ "IsNullable": false }, { - "Name": "Fraction", - "Type": "Float", - "Desc": "Fraction of training data", - "Required": false, - "SortOrder": 2.0, - "IsNullable": false, - "Default": 0.8 - }, - { - "Name": "StratificationColumn", + "Name": "PredictedLabelColumn", "Type": "String", - "Desc": "Stratification column", - "Aliases": [ - "strat" - ], - "Required": false, - "SortOrder": 3.0, - "IsNullable": false, - "Default": null + "Desc": "The predicted label column", + "Required": true, + "SortOrder": 2.0, + "IsNullable": false } ], "Outputs": [ { - "Name": "TrainData", + 
"Name": "OutputData", "Type": "DataView", - "Desc": "Training data" + "Desc": "Transformed dataset" }, { - "Name": "TestData", - "Type": "DataView", - "Desc": "Testing data" + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" ] }, { - "Name": "Transforms.TreeLeafFeaturizer", - "Desc": "Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices.", - "FriendlyName": "Tree Ensemble Featurization Transform", - "ShortName": "TreeFeat", + "Name": "Transforms.RandomNumberGenerator", + "Desc": "Adds a column with a generated number sequence.", + "FriendlyName": "Generate Number Transform", + "ShortName": "Generate", "Inputs": [ { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "UseCounter", + "Type": "Bool", + "Desc": "Use an auto-incremented integer starting at zero instead of a random number", + "Aliases": [ + "cnt" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Seed", + "Type": "UInt", + "Desc": "The random seed", + "Required": false, + "SortOrder": 
150.0, + "IsNullable": true, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:seed)", + "Aliases": [ + "col" + ], "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "Trainer to use", + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", "Required": true, - "SortOrder": 10.0, + "SortOrder": 1.0, "IsNullable": false }, { - "Name": "Suffix", - "Type": "String", - "Desc": "Output column: The suffix to append to the default column names", + "Name": "UseCounter", + "Type": "Bool", + "Desc": "Use an auto-incremented integer starting at zero instead of a random number", "Aliases": [ - "ex" + "cnt" ], "Required": false, - "SortOrder": 101.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": null + "Default": false }, { - "Name": "LabelPermutationSeed", - "Type": "Int", - "Desc": "If specified, determines the permutation seed for applying this featurizer to a multiclass problem.", - "Aliases": [ - "lps" - ], + "Name": "Seed", + "Type": "UInt", + "Desc": "The random seed", "Required": false, - "SortOrder": 102.0, + "SortOrder": 150.0, "IsNullable": false, - "Default": 0 + "Default": 42 } ], "Outputs": [ @@ -18133,7 +17902,6 @@ } ], "InputKind": [ - "IFeaturizerInput", "ITransformInput" ], "OutputKind": [ @@ -18141,624 +17909,2518 @@ ] }, { - "Name": "Transforms.TwoHeterogeneousModelCombiner", - "Desc": "Combines a TransformModel and a PredictorModel into a single PredictorModel.", - "FriendlyName": null, - "ShortName": null, + "Name": "Transforms.RowRangeFilter", + "Desc": "Filters a dataview on a column of type Single, Double or Key (contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. 
If the input is a Key type, the min/max are considered percentages of the number of values.", + "FriendlyName": "Range Filter", + "ShortName": "RangeFilter", "Inputs": [ { - "Name": "TransformModel", - "Type": "TransformModel", - "Desc": "Transform model", + "Name": "Column", + "Type": "String", + "Desc": "Column", + "Aliases": [ + "col" + ], "Required": true, "SortOrder": 1.0, "IsNullable": false }, { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "Predictor model", + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", "Required": true, - "SortOrder": 2.0, + "SortOrder": 1.0, "IsNullable": false + }, + { + "Name": "Min", + "Type": "Float", + "Desc": "Minimum value (0 to 1 for key types)", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Max", + "Type": "Float", + "Desc": "Maximum value (0 to 1 for key types)", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Complement", + "Type": "Bool", + "Desc": "If true, keep the values that fall outside the range.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "IncludeMin", + "Type": "Bool", + "Desc": "If true, include in the range the values that are equal to min.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "IncludeMax", + "Type": "Bool", + "Desc": "If true, include in the range the values that are equal to max.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null } ], "Outputs": [ { - "Name": "PredictorModel", - "Type": "PredictorModel", - "Desc": "Predictor model" + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" ] }, { - "Name": 
"Transforms.WordTokenizer", - "Desc": "The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.", - "FriendlyName": "Tokenize Text Transform", - "ShortName": "TokenizeTextTransform", + "Name": "Transforms.RowSkipAndTakeFilter", + "Desc": "Allows limiting input to a subset of rows at an optional offset. Can be used to implement data paging.", + "FriendlyName": "Skip and Take Filter", + "ShortName": "SkipTake", "Inputs": [ { - "Name": "Column", - "Type": { - "Kind": "Array", - "ItemType": { - "Kind": "Struct", - "Fields": [ - { - "Name": "TermSeparators", - "Type": "String", - "Desc": "Comma separated set of term separator(s). Commonly: 'space', 'comma', 'semicolon' or other single character.", - "Aliases": [ - "sep" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Name", - "Type": "String", - "Desc": "Name of the new column", - "Aliases": [ - "name" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "Source", - "Type": "String", - "Desc": "Name of the source column", - "Aliases": [ - "src" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": null - } - ] - } - }, - "Desc": "New column definition(s)", + "Name": "Skip", + "Type": "Int", + "Desc": "Number of items to skip", "Aliases": [ - "col" + "s" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Take", + "Type": "Int", + "Desc": "Number of items to take", + "Aliases": [ + "t" ], "Required": false, + "SortOrder": 2.0, + "IsNullable": true, + "Default": null + } + ], + "Outputs": [ + { + "Name": "OutputData", + 
"Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.RowSkipFilter", + "Desc": "Allows limiting input to a subset of rows by skipping a number of rows.", + "FriendlyName": "Skip Filter", + "ShortName": "Skip", + "Inputs": [ + { + "Name": "Count", + "Type": "Int", + "Desc": "Number of items to skip", + "Aliases": [ + "c", + "n", + "s" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 0 + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.RowTakeFilter", + "Desc": "Allows limiting input to a subset of rows by taking N first rows.", + "FriendlyName": "Take Filter", + "ShortName": "Take", + "Inputs": [ + { + "Name": "Count", + "Type": "Int", + "Desc": "Number of items to take", + "Aliases": [ + "c", + "n", + "t" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 9223372036854775807 + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.ScoreColumnSelector", + "Desc": "Selects only the last score columns and the extra 
columns specified in the arguments.", + "FriendlyName": "Choose Columns By Index", + "ShortName": null, + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "ExtraColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Extra columns to write", + "Required": false, + "SortOrder": 2.0, "IsNullable": false, "Default": null + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" }, { - "Name": "Data", - "Type": "DataView", - "Desc": "Input dataset", - "Required": true, - "SortOrder": 1.0, - "IsNullable": false + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.Scorer", + "Desc": "Turn the predictor model into a transform model", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The predictor model to turn into a transform", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "ScoredData", + "Type": "DataView", + "Desc": "The scored dataset" + }, + { + "Name": "ScoringTransform", + "Type": "TransformModel", + "Desc": "The scoring transform" + } + ] + }, + { + "Name": "Transforms.Segregator", + "Desc": "Un-groups vector columns into sequences of rows, inverse of Group transform", + "FriendlyName": "Un-group Transform", + "ShortName": "Ungroup", + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Columns to unroll, or 'pivot'", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 150.0, + "IsNullable": false + }, + { + 
"Name": "Mode", + "Type": { + "Kind": "Enum", + "Values": [ + "Inner", + "Outer", + "First" + ] + }, + "Desc": "Specifies how to unroll multiple pivot columns of different size.", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "Inner" + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.SentimentAnalyzer", + "Desc": "Uses a pretrained sentiment model to score input strings", + "FriendlyName": "Sentiment Analyzing Transform", + "ShortName": "Senti", + "Inputs": [ + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column.", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column.", + "Aliases": [ + "dst" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": null + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.SupervisedBinNormalizer", + "Desc": "Similar to BinNormalizer, but calculates bins based on correlation with the label column, not equi-density. 
The new value is bin_number / number_of_bins.", + "FriendlyName": "Supervised Binning Normalizer", + "ShortName": "SupBin", + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "NumBins", + "Type": "Int", + "Desc": "Max number of bins, power of 2 recommended", + "Aliases": [ + "bins" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "FixZero", + "Type": "Bool", + "Desc": "Whether to map zero to zero, preserving sparsity", + "Aliases": [ + "zero" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "MaxTrainingExamples", + "Type": "Int", + "Desc": "Max number of examples used to train the normalizer", + "Aliases": [ + "maxtrain" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "LabelColumn", + "Type": "String", + "Desc": "Label column for supervised binning", + "Aliases": [ + "label", + "lab" + ], + "Required": true, + "SortOrder": 150.0, + "IsNullable": false + }, + { + "Name": "MinBinSize", + "Type": "Int", + "Desc": "Minimum number of examples per bin", + "Required": false, + "SortOrder": 150.0, + 
"IsNullable": false, + "Default": 10 + }, + { + "Name": "NumBins", + "Type": "Int", + "Desc": "Max number of bins, power of 2 recommended", + "Aliases": [ + "bins" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1024 + }, + { + "Name": "FixZero", + "Type": "Bool", + "Desc": "Whether to map zero to zero, preserving sparsity", + "Aliases": [ + "zero" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "MaxTrainingExamples", + "Type": "Int", + "Desc": "Max number of examples used to train the normalizer", + "Aliases": [ + "maxtrain" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000000000 + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.TextFeaturizer", + "Desc": "A transform that turns a collection of text documents into numerical feature vectors. 
The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text.", + "FriendlyName": "Text Transform", + "ShortName": "Text", + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + }, + "Desc": "New column definition (optional form: name:srcs).", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Language", + "Type": { + "Kind": "Enum", + "Values": [ + "English", + "French", + "German", + "Dutch", + "Italian", + "Spanish", + "Japanese" + ] + }, + "Desc": "Dataset language or 'AutoDetect' to detect language per row.", + "Aliases": [ + "lang" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "English" + }, + { + "Name": "StopWordsRemover", + "Type": { + "Kind": "Component", + "ComponentKind": "StopWordsRemover" + }, + "Desc": "Stopwords remover.", + "Aliases": [ + "remover" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "TextCase", + "Type": { + "Kind": "Enum", + "Values": [ + "Lower", + "Upper", + "None" + ] + }, + "Desc": "Casing text using the rules of the invariant culture.", + "Aliases": [ + "case" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": "Lower" + }, + { + "Name": "KeepDiacritics", + "Type": "Bool", + "Desc": "Whether to keep diacritical 
marks or remove them.", + "Aliases": [ + "diac" + ], + "Required": false, + "SortOrder": 6.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "KeepPunctuations", + "Type": "Bool", + "Desc": "Whether to keep punctuation marks or remove them.", + "Aliases": [ + "punc" + ], + "Required": false, + "SortOrder": 7.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "KeepNumbers", + "Type": "Bool", + "Desc": "Whether to keep numbers or remove them.", + "Aliases": [ + "num" + ], + "Required": false, + "SortOrder": 8.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "OutputTokens", + "Type": "Bool", + "Desc": "Whether to output the transformed text tokens as an additional column.", + "Aliases": [ + "tokens", + "showtext", + "showTransformedText" + ], + "Required": false, + "SortOrder": 9.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "Dictionary", + "Type": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Term", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of terms", + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Sort", + "Type": { + "Kind": "Enum", + "Values": [ + "Occurrence", + "Value" + ] + }, + "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": "Occurrence" + }, + { + "Name": "DropUnknowns", + "Type": "Bool", + "Desc": "Drop unknown terms instead of mapping them to NA term.", + "Aliases": [ + "dropna" + ], + "Required": false, + "SortOrder": 6.0, + "IsNullable": false, + "Default": false + } + ] + }, + "Desc": "A dictionary of whitelisted terms.", + "Aliases": [ + "dict" + ], + "Required": false, + "SortOrder": 10.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "WordFeatureExtractor", + "Type": { + "Kind": "Component", + "ComponentKind": "NgramExtractor" + }, + "Desc": "Ngram feature extractor to use for words (WordBag/WordHashBag).", + "Aliases": [ + "wordExtractor" + ], + "Required": false, + "SortOrder": 11.0, + "IsNullable": false, + "Default": { + "Name": "NGram", + "Settings": { + "MaxNumTerms": [ + 10000000 + ] + } + } + }, + { + "Name": "CharFeatureExtractor", + "Type": { + "Kind": "Component", + "ComponentKind": "NgramExtractor" + }, + "Desc": "Ngram feature extractor to use for characters (WordBag/WordHashBag).", + "Aliases": [ + "charExtractor" + ], + "Required": false, + "SortOrder": 12.0, + "IsNullable": false, + "Default": { + "Name": "NGram", + "Settings": { + "NgramLength": 3, + "AllLengths": false, + "MaxNumTerms": [ + 10000000 + ] + } + } + }, + { + "Name": "VectorNormalizer", + "Type": { + "Kind": "Enum", + "Values": [ + "None", + "L1", + "L2", + "LInf" + ] + }, + "Desc": "Normalize vectors (rows) individually by rescaling them to unit norm.", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 13.0, + "IsNullable": false, + "Default": "L2" + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + 
"InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.TextToKeyConverter", + "Desc": "Converts input values (words, numbers, etc.) to index in a dictionary.", + "FriendlyName": "Term Transform", + "ShortName": null, + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "MaxNumTerms", + "Type": "Int", + "Desc": "Maximum number of terms to keep when auto-training", + "Aliases": [ + "max" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Term", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of terms", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Sort", + "Type": { + "Kind": "Enum", + "Values": [ + "Occurrence", + "Value" + ] + }, + "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "TextKeyValues", + "Type": "Bool", + "Desc": "Whether key value metadata should be text, regardless of the actual input type", + "Aliases": [ + "textkv" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "MaxNumTerms", + "Type": "Int", + "Desc": "Maximum number of terms to keep per column when auto-training", + "Aliases": [ + "max" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": 1000000 + }, + { + "Name": "Term", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of terms", + "Required": false, + "SortOrder": 106.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Sort", + "Type": { + "Kind": "Enum", + "Values": [ + "Occurrence", + "Value" + ] + }, + "Desc": "How items should be ordered when vectorized. By default, they will be in the order encountered. 
If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').", + "Required": false, + "SortOrder": 113.0, + "IsNullable": false, + "Default": "Occurrence" + }, + { + "Name": "TextKeyValues", + "Type": "Bool", + "Desc": "Whether key value metadata should be text, regardless of the actual input type", + "Aliases": [ + "textkv" + ], + "Required": false, + "SortOrder": 114.0, + "IsNullable": false, + "Default": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.TrainTestDatasetSplitter", + "Desc": "Split the dataset into train and test sets", + "FriendlyName": "Dataset Train-Test Split", + "ShortName": null, + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Fraction", + "Type": "Float", + "Desc": "Fraction of training data", + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": 0.8 + }, + { + "Name": "StratificationColumn", + "Type": "String", + "Desc": "Stratification column", + "Aliases": [ + "strat" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": null + } + ], + "Outputs": [ + { + "Name": "TrainData", + "Type": "DataView", + "Desc": "Training data" + }, + { + "Name": "TestData", + "Type": "DataView", + "Desc": "Testing data" + } + ] + }, + { + "Name": "Transforms.TreeLeafFeaturizer", + "Desc": "Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. 
A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices.", + "FriendlyName": "Tree Ensemble Featurization Transform", + "ShortName": "TreeFeat", + "Inputs": [ + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "Trainer to use", + "Required": true, + "SortOrder": 10.0, + "IsNullable": false + }, + { + "Name": "Suffix", + "Type": "String", + "Desc": "Output column: The suffix to append to the default column names", + "Aliases": [ + "ex" + ], + "Required": false, + "SortOrder": 101.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "LabelPermutationSeed", + "Type": "Int", + "Desc": "If specified, determines the permutation seed for applying this featurizer to a multiclass problem.", + "Aliases": [ + "lps" + ], + "Required": false, + "SortOrder": 102.0, + "IsNullable": false, + "Default": 0 + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "IFeaturizerInput", + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.TwoHeterogeneousModelCombiner", + "Desc": "Combines a TransformModel and a PredictorModel into a single PredictorModel.", + "FriendlyName": null, + "ShortName": null, + "Inputs": [ + { + "Name": "TransformModel", + "Type": "TransformModel", + "Desc": "Transform model", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, 
+ { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "Predictor model", + "Required": true, + "SortOrder": 2.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "Predictor model" + } + ] + }, + { + "Name": "Transforms.WordTokenizer", + "Desc": "The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.", + "FriendlyName": "Tokenize Text Transform", + "ShortName": "TokenizeTextTransform", + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "TermSeparators", + "Type": "String", + "Desc": "Comma separated set of term separator(s). Commonly: 'space', 'comma', 'semicolon' or other single character.", + "Aliases": [ + "sep" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s)", + "Aliases": [ + "col" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "TermSeparators", + "Type": "String", + "Desc": "Comma separated set of term separator(s). 
Commonly: 'space', 'comma', 'semicolon' or other single character.", + "Aliases": [ + "sep" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "space" + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + } + ], + "Components": [ + { + "Kind": "AutoMlEngine", + "Components": [ + { + "Name": "Defaults", + "Desc": "AutoML engine that returns learners with default settings.", + "FriendlyName": "Defaults Engine", + "Settings": [] + }, + { + "Name": "Rocket", + "Desc": "AutoML engine that consists of distinct, hierarchical stages of operation.", + "FriendlyName": "Rocket Engine", + "Settings": [ + { + "Name": "TopKLearners", + "Type": "Int", + "Desc": "Number of learners to retain for second stage.", + "Aliases": [ + "topk" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 2 + }, + { + "Name": "SecondRoundTrialsPerLearner", + "Type": "Int", + "Desc": "Number of trials for retained second stage learners.", + "Aliases": [ + "stage2num" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": 5 + }, + { + "Name": "RandomInitialization", + "Type": "Bool", + "Desc": "Use random initialization only.", + "Aliases": [ + "randinit" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "NumInitializationPipelines", + "Type": "Int", + "Desc": "Number of initilization pipelines, used for random initialization only.", + "Aliases": [ + "numinitseeds" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": 20 + } + ] + }, + { + "Name": "UniformRandom", + "Desc": "AutoML engine using uniform random sampling.", + "FriendlyName": "Uniform Random Engine", + "Settings": [] + } + ] + }, + { + 
"Kind": "AutoMlStateBase", + "Components": [ + { + "Name": "AutoMlState", + "Desc": "State of an AutoML search and search space.", + "FriendlyName": "AutoML State", + "Aliases": [ + "automlst" + ], + "Settings": [ + { + "Name": "Metric", + "Type": { + "Kind": "Enum", + "Values": [ + "Auc", + "AccuracyMicro", + "AccuracyMacro", + "L2", + "F1", + "AuPrc", + "TopKAccuracy", + "Rms", + "LossFn", + "RSquared", + "LogLoss", + "LogLossReduction", + "Ndcg", + "Dcg", + "PositivePrecision", + "PositiveRecall", + "NegativePrecision", + "NegativeRecall", + "DrAtK", + "DrAtPFpr", + "DrAtNumPos", + "NumAnomalies", + "ThreshAtK", + "ThreshAtP", + "ThreshAtNumPos", + "Nmi", + "AvgMinScore", + "Dbi" + ] + }, + "Desc": "Supported metric for evaluator.", + "Aliases": [ + "metric" + ], + "Required": true, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "Auc" + }, + { + "Name": "Engine", + "Type": { + "Kind": "Component", + "ComponentKind": "AutoMlEngine" + }, + "Desc": "AutoML engine (pipeline optimizer) that generates next candidates.", + "Aliases": [ + "engine" + ], + "Required": true, + "SortOrder": 150.0, + "IsNullable": false + }, + { + "Name": "TrainerKind", + "Type": { + "Kind": "Enum", + "Values": [ + "SignatureBinaryClassifierTrainer", + "SignatureMultiClassClassifierTrainer", + "SignatureRankerTrainer", + "SignatureRegressorTrainer", + "SignatureMultiOutputRegressorTrainer", + "SignatureAnomalyDetectorTrainer", + "SignatureClusteringTrainer" + ] + }, + "Desc": "Kind of trainer for task, such as binary classification trainer, multiclass trainer, etc.", + "Aliases": [ + "tk" + ], + "Required": true, + "SortOrder": 150.0, + "IsNullable": false, + "Default": "SignatureBinaryClassifierTrainer" + }, + { + "Name": "TerminatorArgs", + "Type": { + "Kind": "Component", + "ComponentKind": "SearchTerminator" + }, + "Desc": "Arguments for creating terminator, which determines when to stop search.", + "Aliases": [ + "term" + ], + "Required": true, + "SortOrder": 150.0, + 
"IsNullable": false + }, + { + "Name": "RequestedLearners", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Learner set to sweep over (if available).", + "Aliases": [ + "learners" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + ] + }, + { + "Kind": "CalibratorTrainer", + "Components": [ + { + "Name": "FixedPlattCalibrator", + "Desc": null, + "FriendlyName": "Fixed Platt Calibrator", + "Aliases": [ + "FixedPlatt", + "FixedSigmoid" + ], + "Settings": [ + { + "Name": "Slope", + "Type": "Float", + "Desc": "The slope parameter of f(x) = 1 / (1 + exp(-slope * x + offset)", + "Aliases": [ + "a" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + }, + { + "Name": "Offset", + "Type": "Float", + "Desc": "The offset parameter of f(x) = 1 / (1 + exp(-slope * x + offset)", + "Aliases": [ + "b" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.0 + } + ] + }, + { + "Name": "NaiveCalibrator", + "Desc": null, + "FriendlyName": "Naive Calibrator", + "Aliases": [ + "Naive" + ], + "Settings": [] + }, + { + "Name": "PavCalibrator", + "Desc": null, + "FriendlyName": "PAV Calibrator", + "Aliases": [ + "Pav" + ], + "Settings": [] + }, + { + "Name": "PlattCalibrator", + "Desc": "Platt calibration.", + "FriendlyName": "Platt Calibrator", + "Aliases": [ + "Platt", + "Sigmoid" + ], + "Settings": [] + } + ] + }, + { + "Kind": "ClassificationLossFunction", + "Components": [ + { + "Name": "ExpLoss", + "Desc": "Exponential loss.", + "FriendlyName": "Exponential Loss", + "Settings": [ + { + "Name": "Beta", + "Type": "Float", + "Desc": "Beta (dilation)", + "Aliases": [ + "beta" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + } + ] + }, + { + "Name": "HingeLoss", + "Desc": "Hinge loss.", + "FriendlyName": "Hinge loss", + "Aliases": [ + "Hinge" + ], + "Settings": [ + { + "Name": "Margin", + "Type": "Float", + 
"Desc": "Margin value", + "Aliases": [ + "marg" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + } + ] + }, + { + "Name": "LogLoss", + "Desc": "Log loss.", + "FriendlyName": "Log loss", + "Aliases": [ + "Logistic", + "CrossEntropy" + ], + "Settings": [] + }, + { + "Name": "SmoothedHingeLoss", + "Desc": "Smoothed Hinge loss.", + "FriendlyName": "Smoothed Hinge Loss", + "Aliases": [ + "SmoothedHinge" + ], + "Settings": [ + { + "Name": "SmoothingConst", + "Type": "Float", + "Desc": "Smoothing constant", + "Aliases": [ + "smooth" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1.0 + } + ] + } + ] + }, + { + "Kind": "EarlyStoppingCriterion", + "Components": [ + { + "Name": "GL", + "Desc": "Stop in case of loss of generality.", + "FriendlyName": "Loss of Generality (GL)", + "Settings": [ + { + "Name": "Threshold", + "Type": "Float", + "Desc": "Threshold in range [0,1].", + "Aliases": [ + "th" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01, + "Range": { + "Max": 1.0, + "Min": 0.0 + } + } + ] + }, + { + "Name": "LP", + "Desc": "Stops in case of low progress.", + "FriendlyName": "Low Progress (LP)", + "Settings": [ + { + "Name": "Threshold", + "Type": "Float", + "Desc": "Threshold in range [0,1].", + "Aliases": [ + "th" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01, + "Range": { + "Max": 1.0, + "Min": 0.0 + } + }, + { + "Name": "WindowSize", + "Type": "Int", + "Desc": "The window size.", + "Aliases": [ + "w" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 5, + "Range": { + "Inf": 0 + } + } + ] + }, + { + "Name": "PQ", + "Desc": "Stops in case of generality to progress ration exceeds threshold.", + "FriendlyName": "Generality to Progress Ratio (PQ)", + "Settings": [ + { + "Name": "Threshold", + "Type": "Float", + "Desc": "Threshold in range [0,1].", + "Aliases": [ + "th" + ], + 
"Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01, + "Range": { + "Max": 1.0, + "Min": 0.0 + } + }, + { + "Name": "WindowSize", + "Type": "Int", + "Desc": "The window size.", + "Aliases": [ + "w" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 5, + "Range": { + "Inf": 0 + } + } + ] + }, + { + "Name": "TR", + "Desc": "Stop if validation score exceeds threshold value.", + "FriendlyName": "Tolerant (TR)", + "Settings": [ + { + "Name": "Threshold", + "Type": "Float", + "Desc": "Tolerance threshold. (Non negative value)", + "Aliases": [ + "th" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 0.01, + "Range": { + "Min": 0.0 + } + } + ] + }, + { + "Name": "UP", + "Desc": "Stops in case of consecutive loss in generality.", + "FriendlyName": "Consecutive Loss in Generality (UP)", + "Settings": [ + { + "Name": "WindowSize", + "Type": "Int", + "Desc": "The window size.", + "Aliases": [ + "w" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 5, + "Range": { + "Inf": 0 + } + } + ] + } + ] + }, + { + "Kind": "EnsembleBinaryOutputCombiner", + "Components": [ + { + "Name": "Average", + "Desc": null, + "FriendlyName": "Average", + "Settings": [] + }, + { + "Name": "Median", + "Desc": null, + "FriendlyName": "Median", + "Settings": [] + }, + { + "Name": "Stacking", + "Desc": null, + "FriendlyName": "Stacking", + "Settings": [ + { + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", + "Aliases": [ + "vp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.3 + } + ] + }, + { + "Name": "Voting", + "Desc": null, + "FriendlyName": "Voting", + "Settings": [] + }, + { + "Name": "WeightedAverage", + "Desc": null, + "FriendlyName": "Stacking", + "Settings": [ + { + "Name": "WeightageName", + "Type": { + "Kind": "Enum", + "Values": [ + "Accuracy", + "Auc", + "PosPrecision", + "PosRecall", + "NegPrecision", + "NegRecall" + ] + }, + "Desc": "The metric type to be used to find the weights for each model", + "Aliases": [ + "wn" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": "Auc" + } + ] + } + ] + }, + { + "Kind": "EnsembleBinarySubModelSelector", + "Components": [ + { + "Name": "AllSelector", + "Desc": null, + "FriendlyName": "All Selector", + "Settings": [] + }, + { + "Name": "BestDiverseSelector", + "Desc": null, + "FriendlyName": "Best Diverse Selector", + "Settings": [ + { + "Name": "DiversityMetricType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleDiversityMeasure" + }, + "Desc": "The metric type to be used to find the diversity among base learners", + "Aliases": [ + "dm" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "LearnersSelectionProportion", + "Type": "Float", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", + "Aliases": [ + "lp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.5 + }, + { + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", + "Aliases": [ + "vp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.3 + } + ] + }, + { + "Name": "BestPerformanceSelector", + "Desc": null, + "FriendlyName": "Best Performance Selector", + "Settings": [ + { + "Name": "MetricName", + "Type": { + "Kind": "Enum", + "Values": [ + "Accuracy", + "PosPrecName", + "PosRecallName", + "NegPrecName", + "NegRecallName", + "Auc", + "LogLoss", + "LogLossReduction", + "F1", + "AuPrc" + ] + }, + "Desc": "The metric type to be used to find the best performance", + "Aliases": [ + "mn" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": "Auc" + }, + { + "Name": "LearnersSelectionProportion", + "Type": "Float", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", + "Aliases": [ + "lp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.5 + }, + { + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set", + "Aliases": [ + "vp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.3 + } + ] + } + ] + }, + { + "Kind": "EnsembleDiversityMeasure", + "Components": [ + { + "Name": "DisagreementDiversityMeasure", + "Desc": null, + "FriendlyName": "Disagreement Diversity Measure", + "Settings": [] }, { - "Name": "TermSeparators", - "Type": "String", - "Desc": "Comma separated set of term separator(s). 
Commonly: 'space', 'comma', 'semicolon' or other single character.", - "Aliases": [ - "sep" - ], - "Required": false, - "SortOrder": 150.0, - "IsNullable": false, - "Default": "space" - } - ], - "Outputs": [ - { - "Name": "OutputData", - "Type": "DataView", - "Desc": "Transformed dataset" + "Name": "MultiDisagreementDiversityMeasure", + "Desc": null, + "FriendlyName": "Disagreement Diversity Measure", + "Settings": [] }, { - "Name": "Model", - "Type": "TransformModel", - "Desc": "Transform model" + "Name": "RegressionDisagreementDiversityMeasure", + "Desc": null, + "FriendlyName": "Disagreement Diversity Measure", + "Settings": [] } - ], - "InputKind": [ - "ITransformInput" - ], - "OutputKind": [ - "ITransformOutput" ] - } - ], - "Components": [ + }, { - "Kind": "AutoMlEngine", + "Kind": "EnsembleFeatureSelector", "Components": [ { - "Name": "Defaults", - "Desc": "AutoML engine that returns learners with default settings.", - "FriendlyName": "Defaults Engine", + "Name": "AllFeatureSelector", + "Desc": null, + "FriendlyName": "All Feature Selector", "Settings": [] }, { - "Name": "Rocket", - "Desc": "AutoML engine that consists of distinct, hierarchical stages of operation.", - "FriendlyName": "Rocket Engine", + "Name": "RandomFeatureSelector", + "Desc": null, + "FriendlyName": "Random Feature Selector", "Settings": [ { - "Name": "TopKLearners", - "Type": "Int", - "Desc": "Number of learners to retain for second stage.", + "Name": "FeaturesSelectionProportion", + "Type": "Float", + "Desc": "The proportion of features to be selected. 
The range is 0.0-1.0", "Aliases": [ - "topk" + "fp" ], "Required": false, - "SortOrder": 1.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 2 - }, + "Default": 0.8 + } + ] + } + ] + }, + { + "Kind": "EnsembleMulticlassOutputCombiner", + "Components": [ + { + "Name": "MultiAverage", + "Desc": null, + "FriendlyName": "Average", + "Settings": [ { - "Name": "SecondRoundTrialsPerLearner", - "Type": "Int", - "Desc": "Number of trials for retained second stage learners.", + "Name": "Normalize", + "Type": "Bool", + "Desc": "Whether to normalize the output of base models before combining them", "Aliases": [ - "stage2num" + "norm" ], "Required": false, - "SortOrder": 2.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 5 - }, + "Default": true + } + ] + }, + { + "Name": "MultiMedian", + "Desc": null, + "FriendlyName": "Median", + "Settings": [ { - "Name": "RandomInitialization", + "Name": "Normalize", "Type": "Bool", - "Desc": "Use random initialization only.", + "Desc": "Whether to normalize the output of base models before combining them", "Aliases": [ - "randinit" + "norm" ], "Required": false, - "SortOrder": 3.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": false - }, + "Default": true + } + ] + }, + { + "Name": "MultiStacking", + "Desc": null, + "FriendlyName": "Stacking", + "Settings": [ { - "Name": "NumInitializationPipelines", - "Type": "Int", - "Desc": "Number of initilization pipelines, used for random initialization only.", + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", "Aliases": [ - "numinitseeds" + "vp" ], "Required": false, - "SortOrder": 4.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 20 + "Default": 0.3 } ] }, { - "Name": "UniformRandom", - "Desc": "AutoML engine using uniform random sampling.", - "FriendlyName": "Uniform Random Engine", - "Settings": [] - } - ] - }, - { - "Kind": "AutoMlStateBase", - "Components": [ - { - "Name": "AutoMlState", - "Desc": "State of an AutoML search and search space.", - "FriendlyName": "AutoML State", - "Aliases": [ - "automlst" - ], + "Name": "MultiVoting", + "Desc": null, + "FriendlyName": "Voting", "Settings": [ { - "Name": "Metric", - "Type": { - "Kind": "Enum", - "Values": [ - "Auc", - "AccuracyMicro", - "AccuracyMacro", - "L2", - "F1", - "AuPrc", - "TopKAccuracy", - "Rms", - "LossFn", - "RSquared", - "LogLoss", - "LogLossReduction", - "Ndcg", - "Dcg", - "PositivePrecision", - "PositiveRecall", - "NegativePrecision", - "NegativeRecall", - "DrAtK", - "DrAtPFpr", - "DrAtNumPos", - "NumAnomalies", - "ThreshAtK", - "ThreshAtP", - "ThreshAtNumPos", - "Nmi", - "AvgMinScore", - "Dbi" - ] - }, - "Desc": "Supported metric for evaluator.", + "Name": "Normalize", + "Type": "Bool", + "Desc": "Whether to normalize the output of base models before combining them", "Aliases": [ - "metric" + "norm" ], - "Required": true, - "SortOrder": 150.0, + "Required": false, + "SortOrder": 50.0, "IsNullable": false, - "Default": "Auc" - }, - { - "Name": "Engine", - "Type": { - "Kind": "Component", - "ComponentKind": "AutoMlEngine" - }, - "Desc": "AutoML engine (pipeline optimizer) that generates next candidates.", - "Aliases": [ - "engine" - ], - "Required": true, - "SortOrder": 150.0, - "IsNullable": false - }, + "Default": true + } + ] + }, + { + "Name": "MultiWeightedAverage", + "Desc": null, + "FriendlyName": "Multi Weighted Average", + "Settings": [ { - "Name": "TrainerKind", + "Name": "WeightageName", "Type": { "Kind": "Enum", "Values": [ - 
"SignatureBinaryClassifierTrainer", - "SignatureMultiClassClassifierTrainer", - "SignatureRankerTrainer", - "SignatureRegressorTrainer", - "SignatureMultiOutputRegressorTrainer", - "SignatureAnomalyDetectorTrainer", - "SignatureClusteringTrainer" + "AccuracyMicroAvg", + "AccuracyMacroAvg" ] }, - "Desc": "Kind of trainer for task, such as binary classification trainer, multiclass trainer, etc.", + "Desc": "The metric type to be used to find the weights for each model", "Aliases": [ - "tk" + "wn" ], - "Required": true, - "SortOrder": 150.0, + "Required": false, + "SortOrder": 50.0, "IsNullable": false, - "Default": "SignatureBinaryClassifierTrainer" - }, - { - "Name": "TerminatorArgs", - "Type": { - "Kind": "Component", - "ComponentKind": "SearchTerminator" - }, - "Desc": "Arguments for creating terminator, which determines when to stop search.", - "Aliases": [ - "term" - ], - "Required": true, - "SortOrder": 150.0, - "IsNullable": false + "Default": "AccuracyMicroAvg" }, { - "Name": "RequestedLearners", - "Type": { - "Kind": "Array", - "ItemType": "String" - }, - "Desc": "Learner set to sweep over (if available).", + "Name": "Normalize", + "Type": "Bool", + "Desc": "Whether to normalize the output of base models before combining them", "Aliases": [ - "learners" + "norm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": null + "Default": true } ] } ] }, { - "Kind": "CalibratorTrainer", + "Kind": "EnsembleMulticlassSubModelSelector", "Components": [ { - "Name": "FixedPlattCalibrator", + "Name": "AllSelectorMultiClass", "Desc": null, - "FriendlyName": "Fixed Platt Calibrator", - "Aliases": [ - "FixedPlatt", - "FixedSigmoid" - ], + "FriendlyName": "All Selector", + "Settings": [] + }, + { + "Name": "BestDiverseSelectorMultiClass", + "Desc": null, + "FriendlyName": "Best Diverse Selector", "Settings": [ { - "Name": "Slope", + "Name": "DiversityMetricType", + "Type": { + "Kind": "Component", + "ComponentKind": 
"EnsembleDiversityMeasure" + }, + "Desc": "The metric type to be used to find the diversity among base learners", + "Aliases": [ + "dm" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "LearnersSelectionProportion", "Type": "Float", - "Desc": "The slope parameter of f(x) = 1 / (1 + exp(-slope * x + offset)", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", "Aliases": [ - "a" + "lp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 1.0 + "Default": 0.5 }, { - "Name": "Offset", + "Name": "ValidationDatasetProportion", "Type": "Float", - "Desc": "The offset parameter of f(x) = 1 / (1 + exp(-slope * x + offset)", + "Desc": "The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set", "Aliases": [ - "b" + "vp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 0.0 + "Default": 0.3 } ] }, { - "Name": "NaiveCalibrator", - "Desc": null, - "FriendlyName": "Naive Calibrator", - "Aliases": [ - "Naive" - ], - "Settings": [] - }, - { - "Name": "PavCalibrator", + "Name": "BestPerformanceSelectorMultiClass", "Desc": null, - "FriendlyName": "PAV Calibrator", - "Aliases": [ - "Pav" - ], - "Settings": [] - }, - { - "Name": "PlattCalibrator", - "Desc": "Platt calibration.", - "FriendlyName": "Platt Calibrator", - "Aliases": [ - "Platt", - "Sigmoid" - ], - "Settings": [] - } - ] - }, - { - "Kind": "ClassificationLossFunction", - "Components": [ - { - "Name": "ExpLoss", - "Desc": "Exponential loss.", - "FriendlyName": "Exponential Loss", + "FriendlyName": "Best Performance Selector", "Settings": [ { - "Name": "Beta", + "Name": "MetricName", + "Type": { + "Kind": "Enum", + "Values": [ + "AccuracyMicro", + "AccuracyMacro", + "LogLoss", + "LogLossReduction" + ] + }, + "Desc": "The metric type to be used to find the best performance", + "Aliases": 
[ + "mn" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": "AccuracyMicro" + }, + { + "Name": "LearnersSelectionProportion", "Type": "Float", - "Desc": "Beta (dilation)", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", "Aliases": [ - "beta" + "lp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 1.0 - } - ] - }, - { - "Name": "HingeLoss", - "Desc": "Hinge loss.", - "FriendlyName": "Hinge loss", - "Aliases": [ - "Hinge" - ], - "Settings": [ + "Default": 0.5 + }, { - "Name": "Margin", + "Name": "ValidationDatasetProportion", "Type": "Float", - "Desc": "Margin value", + "Desc": "The proportion of instances to be selected to test the individual base learner. If it is 0, it uses training set", "Aliases": [ - "marg" + "vp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 1.0 + "Default": 0.3 } ] + } + ] + }, + { + "Kind": "EnsembleRegressionOutputCombiner", + "Components": [ + { + "Name": "Average", + "Desc": null, + "FriendlyName": "Average", + "Settings": [] }, { - "Name": "LogLoss", - "Desc": "Log loss.", - "FriendlyName": "Log loss", - "Aliases": [ - "Logistic", - "CrossEntropy" - ], + "Name": "Median", + "Desc": null, + "FriendlyName": "Median", "Settings": [] }, { - "Name": "SmoothedHingeLoss", - "Desc": "Smoothed Hinge loss.", - "FriendlyName": "Smoothed Hinge Loss", - "Aliases": [ - "SmoothedHinge" - ], + "Name": "RegressionStacking", + "Desc": null, + "FriendlyName": "Stacking", "Settings": [ { - "Name": "SmoothingConst", + "Name": "ValidationDatasetProportion", "Type": "Float", - "Desc": "Smoothing constant", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", "Aliases": [ - "smooth" + "vp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 1.0 + "Default": 0.3 } ] } ] }, { - "Kind": "EarlyStoppingCriterion", + "Kind": "EnsembleRegressionSubModelSelector", "Components": [ { - "Name": "GL", - "Desc": "Stop in case of loss of generality.", - "FriendlyName": "Loss of Generality (GL)", + "Name": "AllSelector", + "Desc": null, + "FriendlyName": "All Selector", + "Settings": [] + }, + { + "Name": "BestDiverseSelectorRegression", + "Desc": null, + "FriendlyName": "Best Diverse Selector", "Settings": [ { - "Name": "Threshold", - "Type": "Float", - "Desc": "Threshold in range [0,1].", + "Name": "DiversityMetricType", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleDiversityMeasure" + }, + "Desc": "The metric type to be used to find the diversity among base learners", "Aliases": [ - "th" + "dm" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 0.01, - "Range": { - "Max": 1.0, - "Min": 0.0 - } - } - ] - }, - { - "Name": "LP", - "Desc": "Stops in case of low progress.", - "FriendlyName": "Low Progress (LP)", - "Settings": [ + "Default": null + }, { - "Name": "Threshold", + "Name": "LearnersSelectionProportion", "Type": "Float", - "Desc": "Threshold in range [0,1].", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", "Aliases": [ - "th" + "lp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 0.01, - "Range": { - "Max": 1.0, - "Min": 0.0 - } + "Default": 0.5 }, { - "Name": "WindowSize", - "Type": "Int", - "Desc": "The window size.", + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", "Aliases": [ - "w" + "vp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 5, - "Range": { - "Inf": 0 - } + "Default": 0.3 } ] }, { - "Name": "PQ", - "Desc": "Stops in case of generality to progress ration exceeds threshold.", - "FriendlyName": "Generality to Progress Ratio (PQ)", + "Name": "BestPerformanceRegressionSelector", + "Desc": null, + "FriendlyName": "Best Performance Selector", "Settings": [ { - "Name": "Threshold", + "Name": "MetricName", + "Type": { + "Kind": "Enum", + "Values": [ + "L1", + "L2", + "Rms", + "Loss", + "RSquared" + ] + }, + "Desc": "The metric type to be used to find the best performance", + "Aliases": [ + "mn" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": "L1" + }, + { + "Name": "LearnersSelectionProportion", "Type": "Float", - "Desc": "Threshold in range [0,1].", + "Desc": "The proportion of best base learners to be selected. The range is 0.0-1.0", "Aliases": [ - "th" + "lp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 0.01, - "Range": { - "Max": 1.0, - "Min": 0.0 - } + "Default": 0.5 }, { - "Name": "WindowSize", - "Type": "Int", - "Desc": "The window size.", + "Name": "ValidationDatasetProportion", + "Type": "Float", + "Desc": "The proportion of instances to be selected to test the individual base learner. 
If it is 0, it uses training set", "Aliases": [ - "w" + "vp" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 50.0, "IsNullable": false, - "Default": 5, - "Range": { - "Inf": 0 + "Default": 0.3 + } + ] + } + ] + }, + { + "Kind": "EnsembleSubsetSelector", + "Components": [ + { + "Name": "AllInstanceSelector", + "Desc": null, + "FriendlyName": "All Instance Selector", + "Settings": [ + { + "Name": "FeatureSelector", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleFeatureSelector" + }, + "Desc": "The Feature selector", + "Aliases": [ + "fs" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": { + "Name": "AllFeatureSelector" } } ] }, { - "Name": "TR", - "Desc": "Stop if validation score exceeds threshold value.", - "FriendlyName": "Tolerant (TR)", + "Name": "BootstrapSelector", + "Desc": null, + "FriendlyName": "Bootstrap Selector", "Settings": [ { - "Name": "Threshold", - "Type": "Float", - "Desc": "Tolerance threshold. (Non negative value)", + "Name": "FeatureSelector", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleFeatureSelector" + }, + "Desc": "The Feature selector", "Aliases": [ - "th" + "fs" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 1.0, "IsNullable": false, - "Default": 0.01, - "Range": { - "Min": 0.0 + "Default": { + "Name": "AllFeatureSelector" } } ] }, { - "Name": "UP", - "Desc": "Stops in case of consecutive loss in generality.", - "FriendlyName": "Consecutive Loss in Generality (UP)", + "Name": "RandomPartitionSelector", + "Desc": null, + "FriendlyName": "Random Partition Selector", "Settings": [ { - "Name": "WindowSize", - "Type": "Int", - "Desc": "The window size.", + "Name": "FeatureSelector", + "Type": { + "Kind": "Component", + "ComponentKind": "EnsembleFeatureSelector" + }, + "Desc": "The Feature selector", "Aliases": [ - "w" + "fs" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 1.0, "IsNullable": false, - "Default": 5, - "Range": { - "Inf": 0 + 
"Default": { + "Name": "AllFeatureSelector" } } ] From 94a09d55e1a839ab108cf77c64e9972d81b45bfb Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Tue, 26 Jun 2018 12:53:20 -0700 Subject: [PATCH 17/20] fix tests --- test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index 6130105a19..4c1fcdfd9a 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -1814,19 +1814,19 @@ public void EntryPointLinearSVM() [Fact] public void EntryPointBinaryEnsemble() { - TestEntryPointRoutine("iris.txt", "Trainers.BinaryClassifierEnsemble"); + TestEntryPointRoutine("iris.txt", "Trainers.EnsembleBinaryClassifier"); } [Fact] public void EntryPointClassificationEnsemble() { - TestEntryPointRoutine("iris.txt", "Trainers.ClassificationEnsemble"); + TestEntryPointRoutine("iris.txt", "Trainers.EnsembleClassification"); } [Fact] public void EntryPointRegressionEnsemble() { - TestEntryPointRoutine(TestDatasets.winequality.trainFilename, "Trainers.RegressionEnsemble", loader: TestDatasets.winequality.loaderSettings); + TestEntryPointRoutine(TestDatasets.winequality.trainFilename, "Trainers.EnsembleRegression", loader: TestDatasets.winequality.loaderSettings); } [Fact] From 757f2f1238e9068d5a5d1c44c0d20331588c1f10 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Tue, 26 Jun 2018 15:16:40 -0700 Subject: [PATCH 18/20] fix dataset loadersettings --- test/Microsoft.ML.TestFramework/Datasets.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs index 272780021b..34b9dd49ea 100644 --- a/test/Microsoft.ML.TestFramework/Datasets.cs +++ b/test/Microsoft.ML.TestFramework/Datasets.cs @@ -165,7 +165,7 @@ public static class 
TestDatasets name = "wine", trainFilename = "external/winequality-white.csv", testFilename = "external/winequality-white.csv", - loaderSettings = "loader=Text{col=Label:R4:11 col=Features:R4:0-10 sep=; header+}" + loaderSettings = "col=Label:R4:11 col=Features:R4:0-10 sep=; header+" }; public static TestDataset msm = new TestDataset From 17343f07e8343a67b451c3b03dfc6e7f2bfa20b5 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Tue, 26 Jun 2018 15:28:09 -0700 Subject: [PATCH 19/20] why I have to go through this pain? --- test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs | 2 +- test/Microsoft.ML.TestFramework/Datasets.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index 5bceca7a94..b9500f675d 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -1826,7 +1826,7 @@ public void EntryPointClassificationEnsemble() [Fact] public void EntryPointRegressionEnsemble() { - TestEntryPointRoutine(TestDatasets.winequality.trainFilename, "Trainers.EnsembleRegression", loader: TestDatasets.winequality.loaderSettings); + TestEntryPointRoutine(TestDatasets.winequalitymacro.trainFilename, "Trainers.EnsembleRegression", loader: TestDatasets.winequalitymacro.loaderSettings); } [Fact] diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs index 34b9dd49ea..272780021b 100644 --- a/test/Microsoft.ML.TestFramework/Datasets.cs +++ b/test/Microsoft.ML.TestFramework/Datasets.cs @@ -165,7 +165,7 @@ public static class TestDatasets name = "wine", trainFilename = "external/winequality-white.csv", testFilename = "external/winequality-white.csv", - loaderSettings = "col=Label:R4:11 col=Features:R4:0-10 sep=; header+" + loaderSettings = "loader=Text{col=Label:R4:11 col=Features:R4:0-10 sep=; header+}" }; public static 
TestDataset msm = new TestDataset From 8f383cd69e0f0fe03ef5dbf109f53718abd9cbf8 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Tue, 26 Jun 2018 16:44:34 -0700 Subject: [PATCH 20/20] update manifest file --- .../Common/EntryPoints/core_manifest.json | 239 ++++++++++++++++++ 1 file changed, 239 insertions(+) diff --git a/test/BaselineOutput/Common/EntryPoints/core_manifest.json b/test/BaselineOutput/Common/EntryPoints/core_manifest.json index acea2339aa..0acb5971b0 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_manifest.json +++ b/test/BaselineOutput/Common/EntryPoints/core_manifest.json @@ -9660,6 +9660,245 @@ "ITrainerOutput" ] }, + { + "Name": "Trainers.FieldAwareFactorizationMachineBinaryClassifier", + "Desc": "Train a field-aware factorization machine for binary classification", + "FriendlyName": "Field-aware Factorization Machine", + "ShortName": "ffm", + "Inputs": [ + { + "Name": "LearningRate", + "Type": "Float", + "Desc": "Initial learning rate", + "Aliases": [ + "lr" + ], + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": 0.1, + "SweepRange": { + "RangeType": "Float", + "Min": 0.001, + "Max": 1.0, + "IsLogScale": true + } + }, + { + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Iters", + "Type": "Int", + "Desc": "Number of training iterations", + "Aliases": [ + "iter" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": 5, + "SweepRange": { + "RangeType": "Long", + "Min": 1, + "Max": 100 + } + }, + { + "Name": "FeatureColumn", + "Type": "String", + "Desc": "Column to use for features", + "Aliases": [ + "feat" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Features" + }, + { + "Name": "LatentDim", + "Type": "Int", + "Desc": "Latent space dimension", + "Aliases": [ + "d" + ], + "Required": false, + 
"SortOrder": 3.0, + "IsNullable": false, + "Default": 20, + "SweepRange": { + "RangeType": "Long", + "Min": 4, + "Max": 100 + } + }, + { + "Name": "LabelColumn", + "Type": "String", + "Desc": "Column to use for labels", + "Aliases": [ + "lab" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Label" + }, + { + "Name": "LambdaLinear", + "Type": "Float", + "Desc": "Regularization coefficient of linear weights", + "Aliases": [ + "lambdaLinear" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": 0.0001, + "SweepRange": { + "RangeType": "Float", + "Min": 1E-08, + "Max": 1.0, + "IsLogScale": true + } + }, + { + "Name": "LambdaLatent", + "Type": "Float", + "Desc": "Regularization coefficient of latent weights", + "Aliases": [ + "lambdaLatent" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": 0.0001, + "SweepRange": { + "RangeType": "Float", + "Min": 1E-08, + "Max": 1.0, + "IsLogScale": true + } + }, + { + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": "Auto" + }, + { + "Name": "Norm", + "Type": "Bool", + "Desc": "Whether to normalize the input vectors so that the concatenation of all fields' feature vectors is unit-length", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 6.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "Disk", + "None" + ] + }, + "Desc": "Whether learner should cache input training data", + "Aliases": [ + "cache" + ], + "Required": false, + "SortOrder": 6.0, + "IsNullable": false, + "Default": "Auto" + }, + { + "Name": "Shuffle", + "Type": "Bool", + "Desc": "Whether to shuffle for each training iteration", + "Aliases": [ + 
"shuf" + ], + "Required": false, + "SortOrder": 90.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "Verbose", + "Type": "Bool", + "Desc": "Report traning progress or not", + "Aliases": [ + "verbose" + ], + "Required": false, + "SortOrder": 91.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "Radius", + "Type": "Float", + "Desc": "Radius of initial latent factors", + "Aliases": [ + "rad" + ], + "Required": false, + "SortOrder": 110.0, + "IsNullable": false, + "Default": 0.5, + "SweepRange": { + "RangeType": "Float", + "Min": 0.1, + "Max": 1.0 + } + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IBinaryClassificationOutput", + "ITrainerOutput" + ] + }, { "Name": "Trainers.GeneralizedAdditiveModelBinaryClassifier", "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.",