From 77746d644b3718743980b03392d07a80e1aae7e7 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Sun, 3 Feb 2019 17:04:03 +0000 Subject: [PATCH 1/8] RandomizedPcaTrainer constructor made internal --- src/Microsoft.ML.PCA/PcaTrainer.cs | 26 +++++++++---------- .../Properties/AssemblyInfo.cs | 1 + .../Common/EntryPoints/core_ep-list.tsv | 2 +- .../UnitTests/TestEntryPoints.cs | 2 +- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs index 878011609f..b42bd40995 100644 --- a/src/Microsoft.ML.PCA/PcaTrainer.cs +++ b/src/Microsoft.ML.PCA/PcaTrainer.cs @@ -19,7 +19,7 @@ using Microsoft.ML.Trainers.PCA; using Microsoft.ML.Training; -[assembly: LoadableClass(RandomizedPcaTrainer.Summary, typeof(RandomizedPcaTrainer), typeof(RandomizedPcaTrainer.Arguments), +[assembly: LoadableClass(RandomizedPcaTrainer.Summary, typeof(RandomizedPcaTrainer), typeof(RandomizedPcaTrainer.Options), new[] { typeof(SignatureAnomalyDetectorTrainer), typeof(SignatureTrainer) }, RandomizedPcaTrainer.UserNameValue, RandomizedPcaTrainer.LoadNameValue, @@ -49,7 +49,7 @@ public sealed class RandomizedPcaTrainer : TrainerEstimatorBaseOversampling parameter for randomized PCA training. /// If enabled, data is centered to be zero mean. /// The seed for random number generation. - public RandomizedPcaTrainer(IHostEnvironment env, + internal RandomizedPcaTrainer(IHostEnvironment env, string features, string weights = null, int rank = 20, @@ -103,23 +103,23 @@ public RandomizedPcaTrainer(IHostEnvironment env, } - internal RandomizedPcaTrainer(IHostEnvironment env, Arguments args) - :this(env, args, args.FeatureColumn, args.WeightColumn) + internal RandomizedPcaTrainer(IHostEnvironment env, Options options) + :this(env, options, options.FeatureColumn, options.WeightColumn) { } - private RandomizedPcaTrainer(IHostEnvironment env, Arguments args, string featureColumn, string weightColumn, + private RandomizedPcaTrainer(IHostEnvironment env, Options options, string featureColumn, string weightColumn, int rank = 20, int oversampling = 20, bool center = true, int? seed = null) : base(Contracts.CheckRef(env, nameof(env)).Register(LoadNameValue), TrainerUtils.MakeR4VecFeature(featureColumn), default, TrainerUtils.MakeR4ScalarWeightColumn(weightColumn)) { // if the args are not null, we got here from maml, and the internal ctor. - if (args != null) + if (options != null) { - _rank = args.Rank; - _center = args.Center; - _oversampling = args.Oversampling; - _seed = args.Seed ?? Host.Rand.Next(); + _rank = options.Rank; + _center = options.Center; + _oversampling = options.Oversampling; + _seed = options.Seed ?? Host.Rand.Next(); } else { @@ -347,14 +347,14 @@ protected override AnomalyPredictionTransformer MakeTransfor Desc = "Train an PCA Anomaly model.", UserName = UserNameValue, ShortName = ShortName)] - internal static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironment env, Arguments input) + internal static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironment env, Options input) { Contracts.CheckValue(env, nameof(env)); var host = env.Register("TrainPCAAnomaly"); host.CheckValue(input, nameof(input)); EntryPointUtils.CheckInputArgs(host, input); - return LearnerEntryPointsUtils.Train(host, input, + return LearnerEntryPointsUtils.Train(host, input, () => new RandomizedPcaTrainer(host, input), getWeight: () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn)); } diff --git a/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs b/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs index 76e82b8d0f..a4571c9c78 100644 --- a/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs @@ -5,6 +5,7 @@ using System.Runtime.CompilerServices; using Microsoft.ML; +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Core.Tests" + PublicKey.TestValue)] diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index 8525c932d4..bef05c602a 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -63,7 +63,7 @@ Trainers.LogisticRegressionClassifier Logistic Regression is a method in statist Trainers.NaiveBayesClassifier Train a MultiClassNaiveBayesTrainer. Microsoft.ML.Trainers.MultiClassNaiveBayesTrainer TrainMultiClassNaiveBayesTrainer Microsoft.ML.Trainers.MultiClassNaiveBayesTrainer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+MulticlassClassificationOutput Trainers.OnlineGradientDescentRegressor Train a Online gradient descent perceptron. Microsoft.ML.Trainers.Online.OnlineGradientDescentTrainer TrainRegression Microsoft.ML.Trainers.Online.OnlineGradientDescentTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+RegressionOutput Trainers.OrdinaryLeastSquaresRegressor Train an OLS regression model. Microsoft.ML.Trainers.HalLearners.OlsLinearRegressionTrainer TrainRegression Microsoft.ML.Trainers.HalLearners.OlsLinearRegressionTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+RegressionOutput -Trainers.PcaAnomalyDetector Train an PCA Anomaly model. Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer TrainPcaAnomaly Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+AnomalyDetectionOutput +Trainers.PcaAnomalyDetector Train an PCA Anomaly model. Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer TrainPcaAnomaly Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+AnomalyDetectionOutput Trainers.PoissonRegressor Train an Poisson regression model. Microsoft.ML.Trainers.PoissonRegression TrainRegression Microsoft.ML.Trainers.PoissonRegression+Options Microsoft.ML.EntryPoints.CommonOutputs+RegressionOutput Trainers.StochasticDualCoordinateAscentBinaryClassifier Train an SDCA binary model. Microsoft.ML.Trainers.Sdca TrainBinary Microsoft.ML.Trainers.SdcaBinaryTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+BinaryClassificationOutput Trainers.StochasticDualCoordinateAscentClassifier The SDCA linear multi-class classification trainer. Microsoft.ML.Trainers.Sdca TrainMultiClass Microsoft.ML.Trainers.SdcaMultiClassTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+MulticlassClassificationOutput diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index a75ac92051..dcf3b7e3de 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -3417,7 +3417,7 @@ public void EntryPointPcaPredictorSummary() InputFile = inputFile, }).Data; - var pcaInput = new RandomizedPcaTrainer.Arguments + var pcaInput = new RandomizedPcaTrainer.Options { TrainingData = dataView, }; From e5b1a7443e7bbe444159ae742308446391f3daca Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Thu, 7 Feb 2019 23:04:49 +0000 Subject: [PATCH 2/8] MLCOntext for PCA --- src/Microsoft.ML.Data/MLContext.cs | 6 +++ src/Microsoft.ML.Data/TrainCatalog.cs | 25 +++++++++ src/Microsoft.ML.PCA/PCACatalog.cs | 23 +++++++- .../AnomalyDetectionTests.cs | 53 +++++++++++++++++++ 4 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 test/Microsoft.ML.Tests/AnomalyDetectionTests.cs diff --git a/src/Microsoft.ML.Data/MLContext.cs b/src/Microsoft.ML.Data/MLContext.cs index 75c2554570..66337a7ec9 100644 --- a/src/Microsoft.ML.Data/MLContext.cs +++ b/src/Microsoft.ML.Data/MLContext.cs @@ -40,6 +40,11 @@ public sealed class MLContext : IHostEnvironment /// public RankingCatalog Ranking { get; } + /// + /// Trainers and tasks specific to anomaly detection problems. + /// + public AnomalyDetectionCatalog AnomalyDetection { get; } + /// /// Data processing operations. /// @@ -83,6 +88,7 @@ public MLContext(int? seed = null, int conc = 0) Regression = new RegressionCatalog(_env); Clustering = new ClusteringCatalog(_env); Ranking = new RankingCatalog(_env); + AnomalyDetection = new AnomalyDetectionCatalog(_env); Transforms = new TransformsCatalog(_env); Model = new ModelOperationsCatalog(_env); Data = new DataOperationsCatalog(_env); diff --git a/src/Microsoft.ML.Data/TrainCatalog.cs b/src/Microsoft.ML.Data/TrainCatalog.cs index f10d5dda6e..15dfd66737 100644 --- a/src/Microsoft.ML.Data/TrainCatalog.cs +++ b/src/Microsoft.ML.Data/TrainCatalog.cs @@ -564,4 +564,29 @@ public RankerMetrics Evaluate(IDataView data, string label, string groupId, stri return eval.Evaluate(data, label, groupId, score); } } + + /// + /// The central catalog for anomaly detection tasks and trainers. + /// + public sealed class AnomalyDetectionCatalog : TrainCatalogBase + { + /// + /// The list of trainers for anomaly detection. + /// + public AnomalyDetectionTrainers Trainers { get; } + + internal AnomalyDetectionCatalog(IHostEnvironment env) + : base(env, nameof(AnomalyDetectionCatalog)) + { + Trainers = new AnomalyDetectionTrainers(this); + } + + public sealed class AnomalyDetectionTrainers : CatalogInstantiatorBase + { + internal AnomalyDetectionTrainers(AnomalyDetectionCatalog catalog) + : base(catalog) + { + } + } + } } diff --git a/src/Microsoft.ML.PCA/PCACatalog.cs b/src/Microsoft.ML.PCA/PCACatalog.cs index d60bf7d1d0..03063152a2 100644 --- a/src/Microsoft.ML.PCA/PCACatalog.cs +++ b/src/Microsoft.ML.PCA/PCACatalog.cs @@ -3,13 +3,14 @@ // See the LICENSE file in the project root for more information. using Microsoft.ML.Data; +using Microsoft.ML.Trainers.PCA; using Microsoft.ML.Transforms.Projections; +using static Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer; namespace Microsoft.ML { public static class PcaCatalog { - /// Initializes a new instance of . /// The transform's catalog. /// Name of the column resulting from the transformation of . @@ -35,5 +36,25 @@ public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(t /// Input columns to apply PrincipalComponentAnalysis on. public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(this TransformsCatalog.ProjectionTransforms catalog, params PrincipalComponentAnalysisEstimator.ColumnInfo[] columns) => new PrincipalComponentAnalysisEstimator(CatalogUtils.GetEnvironment(catalog), columns); + + public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog, + string featureColumn = DefaultColumnNames.Features, + string weights = null, + int rank = 20, + int oversampling = 20, + bool center = true, + int? seed = null) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new RandomizedPcaTrainer(env, featureColumn, weights, rank, oversampling, center, seed); + } + + public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog, Options options) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new RandomizedPcaTrainer(env, options); + } } } diff --git a/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs new file mode 100644 index 0000000000..90a5dc125d --- /dev/null +++ b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs @@ -0,0 +1,53 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Drawing; +using System.Drawing.Imaging; +using System.IO; +using System.Linq; +using Microsoft.Data.DataView; +using Microsoft.ML.Data; +using Microsoft.ML.ImageAnalytics; +using Microsoft.ML.Model; +using Microsoft.ML.RunTests; +using Xunit; +using Xunit.Abstractions; + +namespace Microsoft.ML.Tests +{ + public class AnomalyDetectionTests : TestDataPipeBase + { + public AnomalyDetectionTests(ITestOutputHelper output) : base(output) + { + } + + /// + /// RandomizedPcaTrainer test + /// + [Fact] + public void RandomizedPcaTrainer() + { + var mlContext = new MLContext(seed: 1, conc: 1); + string featureColumn = "NumericFeatures"; + + var reader = new TextLoader(Env, new TextLoader.Arguments() + { + HasHeader = true, + Separator = "\t", + Columns = new[] + { + new TextLoader.Column(featureColumn, DataKind.R4, new [] { new TextLoader.Range(1, 784) }) + } + }); + + var data = reader.Read(GetDataPath(TestDatasets.mnistOneClass.trainFilename)); + + var pipeline = mlContext.AnomalyDetection.Trainers.RandomizedPca(featureColumn); + + var transformer = pipeline.Fit(data); + var transformedData = transformer.Transform(data); + } + } +} From 8c47da15c76a489334e84b45492ce0e9a4d62104 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Thu, 7 Feb 2019 23:31:36 +0000 Subject: [PATCH 3/8] update test example --- test/Microsoft.ML.Tests/AnomalyDetectionTests.cs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs index 90a5dc125d..abc20d113d 100644 --- a/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs +++ b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs @@ -42,12 +42,13 @@ public void RandomizedPcaTrainer() } }); - var data = reader.Read(GetDataPath(TestDatasets.mnistOneClass.trainFilename)); + var trainData = reader.Read(GetDataPath(TestDatasets.mnistOneClass.trainFilename)); + var testData = reader.Read(GetDataPath(TestDatasets.mnistOneClass.testFilename)); var pipeline = mlContext.AnomalyDetection.Trainers.RandomizedPca(featureColumn); - var transformer = pipeline.Fit(data); - var transformedData = transformer.Transform(data); + var transformer = pipeline.Fit(trainData); + var transformedData = transformer.Transform(testData); } } } From 8adb8b13d88405a14911eec6488b817af6c180c1 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Sun, 10 Feb 2019 22:56:49 +0000 Subject: [PATCH 4/8] added evaluation metrics for anomaly detection --- .../Evaluators/AnomalyDetectionEvaluator.cs | 38 +++++++++++++++++++ .../Metrics/AnomalyDetectionMetrics.cs | 37 ++++++++++++++++++ src/Microsoft.ML.Data/TrainCatalog.cs | 23 +++++++++++ .../AnomalyDetectionTests.cs | 15 +++++++- 4 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs diff --git a/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs b/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs index 1878fa9d0c..f36913e196 100644 --- a/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs +++ b/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs @@ -10,6 +10,7 @@ using Microsoft.ML; using Microsoft.ML.CommandLine; using Microsoft.ML.Data; +using Microsoft.ML.Data.Evaluators.Metrics; using Microsoft.ML.EntryPoints; using Microsoft.ML.Internal.Utilities; using Microsoft.ML.Transforms; @@ -576,6 +577,43 @@ public void Finish() FinishOtherMetrics(); } } + + /// + /// Evaluates scored anomaly detection data. + /// + /// The scored data. + /// The name of the label column in . + /// The name of the score column in . + /// The name of the predicted label column in . + /// The evaluation results for these outputs. + public AnomalyDetectionMetrics Evaluate(IDataView data, string label, string score, string predictedLabel) + { + Host.CheckValue(data, nameof(data)); + Host.CheckNonEmpty(label, nameof(label)); + Host.CheckNonEmpty(score, nameof(score)); + Host.CheckNonEmpty(predictedLabel, nameof(predictedLabel)); + + var roles = new RoleMappedData(data, opt: false, + RoleMappedSchema.ColumnRole.Label.Bind(label), + RoleMappedSchema.CreatePair(MetadataUtils.Const.ScoreValueKind.Score, score), + RoleMappedSchema.CreatePair(MetadataUtils.Const.ScoreValueKind.PredictedLabel, predictedLabel)); + + var resultDict = ((IEvaluator)this).Evaluate(roles); + Host.Assert(resultDict.ContainsKey(MetricKinds.OverallMetrics)); + var overall = resultDict[MetricKinds.OverallMetrics]; + + AnomalyDetectionMetrics result; + using (var cursor = overall.GetRowCursorForAllColumns()) + { + var moved = cursor.MoveNext(); + Host.Assert(moved); + result = new AnomalyDetectionMetrics(Host, cursor); + moved = cursor.MoveNext(); + Host.Assert(!moved); + } + return result; + } + } [BestFriend] diff --git a/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs b/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs new file mode 100644 index 0000000000..91c9f6dcb4 --- /dev/null +++ b/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs @@ -0,0 +1,37 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.Data.DataView; + +namespace Microsoft.ML.Data.Evaluators.Metrics +{ + public sealed class AnomalyDetectionMetrics + { + public double Auc { get; } + public double DrAtK { get; } + public double DrAtPFpr { get; } + public double DrAtNumPos { get; } + public double NumAnomalies { get; } + public double ThreshAtK { get; } + public double ThreshAtP { get; } + public double ThreshAtNumPos { get; } + + internal AnomalyDetectionMetrics(IExceptionContext ectx, Row overallResult) + { + long FetchInt(string name) => RowCursorUtils.Fetch(ectx, overallResult, name); + float FetchFloat(string name) => RowCursorUtils.Fetch(ectx, overallResult, name); + double FetchDouble(string name) => RowCursorUtils.Fetch(ectx, overallResult, name); + + Auc = FetchDouble(BinaryClassifierEvaluator.Auc); + DrAtK = FetchDouble(AnomalyDetectionEvaluator.OverallMetrics.DrAtK); + DrAtPFpr = FetchDouble(AnomalyDetectionEvaluator.OverallMetrics.DrAtPFpr); + DrAtNumPos = FetchDouble(AnomalyDetectionEvaluator.OverallMetrics.DrAtNumPos); + NumAnomalies = FetchInt(AnomalyDetectionEvaluator.OverallMetrics.NumAnomalies); + ThreshAtK = FetchFloat(AnomalyDetectionEvaluator.OverallMetrics.ThreshAtK); + ThreshAtP = FetchFloat(AnomalyDetectionEvaluator.OverallMetrics.ThreshAtP); + ThreshAtNumPos = FetchFloat(AnomalyDetectionEvaluator.OverallMetrics.ThreshAtNumPos); + } + } +} diff --git a/src/Microsoft.ML.Data/TrainCatalog.cs b/src/Microsoft.ML.Data/TrainCatalog.cs index 776864733f..b7f25792dd 100644 --- a/src/Microsoft.ML.Data/TrainCatalog.cs +++ b/src/Microsoft.ML.Data/TrainCatalog.cs @@ -8,6 +8,7 @@ using Microsoft.Data.DataView; using Microsoft.ML.Core.Data; using Microsoft.ML.Data; +using Microsoft.ML.Data.Evaluators.Metrics; using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.Conversions; @@ -588,5 +589,27 @@ internal AnomalyDetectionTrainers(AnomalyDetectionCatalog catalog) { } } + + /// + /// Evaluates scored anomaly detection data. + /// + /// The scored data. + /// The name of the label column in . + /// The name of the score column in . + /// The name of the predicted label column in . + /// The evaluation results for these calibrated outputs. + public AnomalyDetectionMetrics Evaluate(IDataView data, string label = DefaultColumnNames.Label, string score = DefaultColumnNames.Score, + string predictedLabel = DefaultColumnNames.PredictedLabel) + { + Host.CheckValue(data, nameof(data)); + Host.CheckNonEmpty(label, nameof(label)); + Host.CheckNonEmpty(score, nameof(score)); + Host.CheckNonEmpty(predictedLabel, nameof(predictedLabel)); + + var args = new AnomalyDetectionEvaluator.Arguments(); + + var eval = new AnomalyDetectionEvaluator(Host, args); + return eval.Evaluate(data, label, score, predictedLabel); + } } } diff --git a/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs index abc20d113d..595081726f 100644 --- a/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs +++ b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs @@ -38,6 +38,7 @@ public void RandomizedPcaTrainer() Separator = "\t", Columns = new[] { + new TextLoader.Column("Label", DataKind.R4, 0), new TextLoader.Column(featureColumn, DataKind.R4, new [] { new TextLoader.Range(1, 784) }) } }); @@ -45,10 +46,22 @@ public void RandomizedPcaTrainer() var trainData = reader.Read(GetDataPath(TestDatasets.mnistOneClass.trainFilename)); var testData = reader.Read(GetDataPath(TestDatasets.mnistOneClass.testFilename)); - var pipeline = mlContext.AnomalyDetection.Trainers.RandomizedPca(featureColumn); + var pipeline = ML.AnomalyDetection.Trainers.RandomizedPca(featureColumn); var transformer = pipeline.Fit(trainData); var transformedData = transformer.Transform(testData); + + // Evaluate + var metrics = ML.AnomalyDetection.Evaluate(transformedData); + + Assert.Equal(0.99, metrics.Auc, 2); + Assert.Equal(0.90, metrics.DrAtK, 2); + Assert.Equal(0.90, metrics.DrAtPFpr, 2); + Assert.Equal(0.90, metrics.DrAtNumPos, 2); + Assert.Equal(10, metrics.NumAnomalies); + Assert.Equal(0.57, metrics.ThreshAtK, 2); + Assert.Equal(0.63, metrics.ThreshAtP, 2); + Assert.Equal(0.65, metrics.ThreshAtNumPos, 2); } } } From 62f5db859ebb68f9e45d97c9c91e8fcfa3e969f5 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Mon, 11 Feb 2019 09:11:04 +0000 Subject: [PATCH 5/8] make tests work. it seems adding a catalog to MLContext changes some seeds? --- test/BaselineOutput/SingleDebug/Rff/featurized.tsv | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/BaselineOutput/SingleDebug/Rff/featurized.tsv b/test/BaselineOutput/SingleDebug/Rff/featurized.tsv index 313d076a9e..90fcbc2cdd 100644 --- a/test/BaselineOutput/SingleDebug/Rff/featurized.tsv +++ b/test/BaselineOutput/SingleDebug/Rff/featurized.tsv @@ -6,7 +6,7 @@ #@ col=RffVectorFloat:R4:9-14 #@ } 15 8:Label -5 1 1 1 2 1 3 1 0 -0.157029659 -0.555585265 0.490177631 -0.305056125 0.35670203 -0.453979075 -5 4 4 5 7 10 3 2 0 -0.375955045 0.43816793 -0.5670244 0.108704455 -0.271485656 -0.5095379 -3 1 1 1 2 2 3 1 0 -0.08380841 -0.571235061 0.4856296 -0.312245429 0.389987826 -0.4257262 -6 8 8 1 3 4 3 7 0 -0.2813567 0.504154444 -0.266616732 -0.512102365 -0.5723418 -0.07588247 +5 1 1 1 2 1 3 1 0 0.494028777 0.298778981 0.533874154 -0.219799235 0.5505202 -0.173956484 +5 4 4 5 7 10 3 2 0 0.363161922 -0.4488282 0.5746647 -0.0556217246 0.276754946 0.5066952 +3 1 1 1 2 2 3 1 0 0.5167663 0.257460445 0.5747438 -0.05479813 0.5746628 0.0556417964 +6 8 8 1 3 4 3 7 0 -0.5738443 -0.0635295138 0.3556944 -0.454768956 -0.4006303 -0.415726721 From 02b8651f61e5d61328e8ed90c6d28ae71f8f0117 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Mon, 11 Feb 2019 18:44:49 +0000 Subject: [PATCH 6/8] also updating baseline file for Release builds --- test/BaselineOutput/SingleRelease/Rff/featurized.tsv | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/BaselineOutput/SingleRelease/Rff/featurized.tsv b/test/BaselineOutput/SingleRelease/Rff/featurized.tsv index 313d076a9e..90fcbc2cdd 100644 --- a/test/BaselineOutput/SingleRelease/Rff/featurized.tsv +++ b/test/BaselineOutput/SingleRelease/Rff/featurized.tsv @@ -6,7 +6,7 @@ #@ col=RffVectorFloat:R4:9-14 #@ } 15 8:Label -5 1 1 1 2 1 3 1 0 -0.157029659 -0.555585265 0.490177631 -0.305056125 0.35670203 -0.453979075 -5 4 4 5 7 10 3 2 0 -0.375955045 0.43816793 -0.5670244 0.108704455 -0.271485656 -0.5095379 -3 1 1 1 2 2 3 1 0 -0.08380841 -0.571235061 0.4856296 -0.312245429 0.389987826 -0.4257262 -6 8 8 1 3 4 3 7 0 -0.2813567 0.504154444 -0.266616732 -0.512102365 -0.5723418 -0.07588247 +5 1 1 1 2 1 3 1 0 0.494028777 0.298778981 0.533874154 -0.219799235 0.5505202 -0.173956484 +5 4 4 5 7 10 3 2 0 0.363161922 -0.4488282 0.5746647 -0.0556217246 0.276754946 0.5066952 +3 1 1 1 2 2 3 1 0 0.5167663 0.257460445 0.5747438 -0.05479813 0.5746628 0.0556417964 +6 8 8 1 3 4 3 7 0 -0.5738443 -0.0635295138 0.3556944 -0.454768956 -0.4006303 -0.415726721 From b82c3261edcc244a8e82fb8786907cb6737683d7 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Mon, 11 Feb 2019 21:32:45 +0000 Subject: [PATCH 7/8] review comments --- .../Evaluators/AnomalyDetectionEvaluator.cs | 3 +- .../Metrics/AnomalyDetectionMetrics.cs | 31 +++++++++++++------ src/Microsoft.ML.PCA/PCACatalog.cs | 21 +++++++++++-- src/Microsoft.ML.PCA/PcaTrainer.cs | 19 ++++++++---- .../AnomalyDetectionTests.cs | 4 --- 5 files changed, 54 insertions(+), 24 deletions(-) diff --git a/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs b/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs index f36913e196..cfd0f5fd29 100644 --- a/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs +++ b/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs @@ -586,7 +586,8 @@ public void Finish() /// The name of the score column in . /// The name of the predicted label column in . /// The evaluation results for these outputs. - public AnomalyDetectionMetrics Evaluate(IDataView data, string label, string score, string predictedLabel) + internal AnomalyDetectionMetrics Evaluate(IDataView data, string label = DefaultColumnNames.Label, string score = DefaultColumnNames.Score, + string predictedLabel = DefaultColumnNames.PredictedLabel) { Host.CheckValue(data, nameof(data)); Host.CheckNonEmpty(label, nameof(label)); diff --git a/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs b/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs index 91c9f6dcb4..4fa81a63c7 100644 --- a/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs +++ b/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs @@ -7,31 +7,42 @@ namespace Microsoft.ML.Data.Evaluators.Metrics { + /// + /// Evaluation results for anomaly detection. + /// public sealed class AnomalyDetectionMetrics { + /// + /// Gets the area under the ROC curve. + /// + /// + /// The area under the ROC curve is equal to the probability that the algorithm ranks + /// a randomly chosen positive instance higher than a randomly chosen negative one + /// (assuming 'positive' ranks higher than 'negative'). + /// public double Auc { get; } + + /// + /// Detection rate at k false positives. + /// public double DrAtK { get; } + /// + /// Detection rate at fraction p false positives. + /// public double DrAtPFpr { get; } + /// + /// Detection rate at number of anomalies. + /// public double DrAtNumPos { get; } - public double NumAnomalies { get; } - public double ThreshAtK { get; } - public double ThreshAtP { get; } - public double ThreshAtNumPos { get; } internal AnomalyDetectionMetrics(IExceptionContext ectx, Row overallResult) { - long FetchInt(string name) => RowCursorUtils.Fetch(ectx, overallResult, name); - float FetchFloat(string name) => RowCursorUtils.Fetch(ectx, overallResult, name); double FetchDouble(string name) => RowCursorUtils.Fetch(ectx, overallResult, name); Auc = FetchDouble(BinaryClassifierEvaluator.Auc); DrAtK = FetchDouble(AnomalyDetectionEvaluator.OverallMetrics.DrAtK); DrAtPFpr = FetchDouble(AnomalyDetectionEvaluator.OverallMetrics.DrAtPFpr); DrAtNumPos = FetchDouble(AnomalyDetectionEvaluator.OverallMetrics.DrAtNumPos); - NumAnomalies = FetchInt(AnomalyDetectionEvaluator.OverallMetrics.NumAnomalies); - ThreshAtK = FetchFloat(AnomalyDetectionEvaluator.OverallMetrics.ThreshAtK); - ThreshAtP = FetchFloat(AnomalyDetectionEvaluator.OverallMetrics.ThreshAtP); - ThreshAtNumPos = FetchFloat(AnomalyDetectionEvaluator.OverallMetrics.ThreshAtNumPos); } } } diff --git a/src/Microsoft.ML.PCA/PCACatalog.cs b/src/Microsoft.ML.PCA/PCACatalog.cs index 03063152a2..696bf1f741 100644 --- a/src/Microsoft.ML.PCA/PCACatalog.cs +++ b/src/Microsoft.ML.PCA/PCACatalog.cs @@ -37,12 +37,22 @@ public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(t public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(this TransformsCatalog.ProjectionTransforms catalog, params PrincipalComponentAnalysisEstimator.ColumnInfo[] columns) => new PrincipalComponentAnalysisEstimator(CatalogUtils.GetEnvironment(catalog), columns); + /// + /// Trains an approximate PCA using Randomized SVD algorithm. + /// + /// The anomaly detection catalog trainer object. + /// The features, or independent variables. + /// The optional example weights. + /// The number of components in the PCA. + /// Oversampling parameter for randomized PCA training. + /// If enabled, data is centered to be zero mean. + /// The seed for random number generation. public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog, string featureColumn = DefaultColumnNames.Features, string weights = null, - int rank = 20, - int oversampling = 20, - bool center = true, + int rank = Options.Defaults.NumComponents, + int oversampling = Options.Defaults.OversamplingParameters, + bool center = Options.Defaults.IsCenteredZeroMean, int? seed = null) { Contracts.CheckValue(catalog, nameof(catalog)); @@ -50,6 +60,11 @@ public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.An return new RandomizedPcaTrainer(env, featureColumn, weights, rank, oversampling, center, seed); } + /// + /// Trains an approximate PCA using Randomized SVD algorithm. + /// + /// The anomaly detection catalog trainer object. + /// Advanced options to the algorithm. public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog, Options options) { Contracts.CheckValue(catalog, nameof(catalog)); diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs index b42bd40995..e247d6f500 100644 --- a/src/Microsoft.ML.PCA/PcaTrainer.cs +++ b/src/Microsoft.ML.PCA/PcaTrainer.cs @@ -54,19 +54,26 @@ public class Options : UnsupervisedLearnerInputBaseWithWeight [Argument(ArgumentType.AtMostOnce, HelpText = "The number of components in the PCA", ShortName = "k", SortOrder = 50)] [TGUI(SuggestedSweeps = "10,20,40,80")] [TlcModule.SweepableDiscreteParam("Rank", new object[] { 10, 20, 40, 80 })] - public int Rank = 20; + public int Rank = Defaults.NumComponents; [Argument(ArgumentType.AtMostOnce, HelpText = "Oversampling parameter for randomized PCA training", SortOrder = 50)] [TGUI(SuggestedSweeps = "10,20,40")] [TlcModule.SweepableDiscreteParam("Oversampling", new object[] { 10, 20, 40 })] - public int Oversampling = 20; + public int Oversampling = Defaults.OversamplingParameters; [Argument(ArgumentType.AtMostOnce, HelpText = "If enabled, data is centered to be zero mean", ShortName = "center")] [TlcModule.SweepableDiscreteParam("Center", null, isBool: true)] - public bool Center = true; + public bool Center = Defaults.IsCenteredZeroMean; [Argument(ArgumentType.AtMostOnce, HelpText = "The seed for random number generation", ShortName = "seed")] public int? Seed; + + internal static class Defaults + { + public const int NumComponents = 20; + public const int OversamplingParameters = 20; + public const bool IsCenteredZeroMean = true; + } } private readonly int _rank; @@ -94,9 +101,9 @@ public class Options : UnsupervisedLearnerInputBaseWithWeight internal RandomizedPcaTrainer(IHostEnvironment env, string features, string weights = null, - int rank = 20, - int oversampling = 20, - bool center = true, + int rank = Options.Defaults.NumComponents, + int oversampling = Options.Defaults.OversamplingParameters, + bool center = Options.Defaults.IsCenteredZeroMean, int? seed = null) : this(env, null, features, weights, rank, oversampling, center, seed) { diff --git a/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs index 595081726f..9ac657dd0f 100644 --- a/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs +++ b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs @@ -58,10 +58,6 @@ public void RandomizedPcaTrainer() Assert.Equal(0.90, metrics.DrAtK, 2); Assert.Equal(0.90, metrics.DrAtPFpr, 2); Assert.Equal(0.90, metrics.DrAtNumPos, 2); - Assert.Equal(10, metrics.NumAnomalies); - Assert.Equal(0.57, metrics.ThreshAtK, 2); - Assert.Equal(0.63, metrics.ThreshAtP, 2); - Assert.Equal(0.65, metrics.ThreshAtNumPos, 2); } } } From b1526d1b2445f59c1daf9612a4ac917300fe9229 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Tue, 12 Feb 2019 22:17:35 +0000 Subject: [PATCH 8/8] taking care of review comments --- .../Metrics/AnomalyDetectionMetrics.cs | 22 +++++++++---------- src/Microsoft.ML.Data/TrainCatalog.cs | 6 +++-- .../AnomalyDetectionTests.cs | 6 ++--- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs b/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs index 4fa81a63c7..7401fd422d 100644 --- a/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs +++ b/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs @@ -23,26 +23,24 @@ public sealed class AnomalyDetectionMetrics public double Auc { get; } /// - /// Detection rate at k false positives. + /// Detection rate at K false positives. /// + /// + /// This is computed as follows: + /// 1.Sort the test examples by the output of the anomaly detector in descending order of scores. + /// 2.Among the top K False Positives, compute ratio : (True Positive @ K) / (Total anomalies in test data) + /// Example confusion matrix for anomaly detection: + /// Anomalies (in test data) | Non-Anomalies (in test data) + /// Predicted Anomalies : TP | FP + /// Predicted Non-Anomalies : FN | TN + /// public double DrAtK { get; } - /// - /// Detection rate at fraction p false positives. - /// - public double DrAtPFpr { get; } - /// - /// Detection rate at number of anomalies. - /// - public double DrAtNumPos { get; } internal AnomalyDetectionMetrics(IExceptionContext ectx, Row overallResult) { double FetchDouble(string name) => RowCursorUtils.Fetch(ectx, overallResult, name); - Auc = FetchDouble(BinaryClassifierEvaluator.Auc); DrAtK = FetchDouble(AnomalyDetectionEvaluator.OverallMetrics.DrAtK); - DrAtPFpr = FetchDouble(AnomalyDetectionEvaluator.OverallMetrics.DrAtPFpr); - DrAtNumPos = FetchDouble(AnomalyDetectionEvaluator.OverallMetrics.DrAtNumPos); } } } diff --git a/src/Microsoft.ML.Data/TrainCatalog.cs b/src/Microsoft.ML.Data/TrainCatalog.cs index b7f25792dd..cd2b886423 100644 --- a/src/Microsoft.ML.Data/TrainCatalog.cs +++ b/src/Microsoft.ML.Data/TrainCatalog.cs @@ -597,9 +597,10 @@ internal AnomalyDetectionTrainers(AnomalyDetectionCatalog catalog) /// The name of the label column in . /// The name of the score column in . /// The name of the predicted label column in . - /// The evaluation results for these calibrated outputs. + /// The number of false positives to compute the metric. + /// Evaluation results. public AnomalyDetectionMetrics Evaluate(IDataView data, string label = DefaultColumnNames.Label, string score = DefaultColumnNames.Score, - string predictedLabel = DefaultColumnNames.PredictedLabel) + string predictedLabel = DefaultColumnNames.PredictedLabel, int k = 10) { Host.CheckValue(data, nameof(data)); Host.CheckNonEmpty(label, nameof(label)); @@ -607,6 +608,7 @@ public AnomalyDetectionMetrics Evaluate(IDataView data, string label = DefaultCo Host.CheckNonEmpty(predictedLabel, nameof(predictedLabel)); var args = new AnomalyDetectionEvaluator.Arguments(); + args.K = k; var eval = new AnomalyDetectionEvaluator(Host, args); return eval.Evaluate(data, label, score, predictedLabel); diff --git a/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs index 9ac657dd0f..d8335eaa26 100644 --- a/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs +++ b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs @@ -52,12 +52,10 @@ public void RandomizedPcaTrainer() var transformedData = transformer.Transform(testData); // Evaluate - var metrics = ML.AnomalyDetection.Evaluate(transformedData); + var metrics = ML.AnomalyDetection.Evaluate(transformedData, k: 10); - Assert.Equal(0.99, metrics.Auc, 2); + Assert.Equal(0.98558, metrics.Auc, 5); Assert.Equal(0.90, metrics.DrAtK, 2); - Assert.Equal(0.90, metrics.DrAtPFpr, 2); - Assert.Equal(0.90, metrics.DrAtNumPos, 2); } } }