diff --git a/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs b/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs index 1878fa9d0c..cfd0f5fd29 100644 --- a/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs +++ b/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs @@ -10,6 +10,7 @@ using Microsoft.ML; using Microsoft.ML.CommandLine; using Microsoft.ML.Data; +using Microsoft.ML.Data.Evaluators.Metrics; using Microsoft.ML.EntryPoints; using Microsoft.ML.Internal.Utilities; using Microsoft.ML.Transforms; @@ -576,6 +577,44 @@ public void Finish() FinishOtherMetrics(); } } + + /// + /// Evaluates scored anomaly detection data. + /// + /// The scored data. + /// The name of the label column in . + /// The name of the score column in . + /// The name of the predicted label column in . + /// The evaluation results for these outputs. + internal AnomalyDetectionMetrics Evaluate(IDataView data, string label = DefaultColumnNames.Label, string score = DefaultColumnNames.Score, + string predictedLabel = DefaultColumnNames.PredictedLabel) + { + Host.CheckValue(data, nameof(data)); + Host.CheckNonEmpty(label, nameof(label)); + Host.CheckNonEmpty(score, nameof(score)); + Host.CheckNonEmpty(predictedLabel, nameof(predictedLabel)); + + var roles = new RoleMappedData(data, opt: false, + RoleMappedSchema.ColumnRole.Label.Bind(label), + RoleMappedSchema.CreatePair(MetadataUtils.Const.ScoreValueKind.Score, score), + RoleMappedSchema.CreatePair(MetadataUtils.Const.ScoreValueKind.PredictedLabel, predictedLabel)); + + var resultDict = ((IEvaluator)this).Evaluate(roles); + Host.Assert(resultDict.ContainsKey(MetricKinds.OverallMetrics)); + var overall = resultDict[MetricKinds.OverallMetrics]; + + AnomalyDetectionMetrics result; + using (var cursor = overall.GetRowCursorForAllColumns()) + { + var moved = cursor.MoveNext(); + Host.Assert(moved); + result = new AnomalyDetectionMetrics(Host, cursor); + moved = cursor.MoveNext(); + Host.Assert(!moved); 
+ } + return result; + } + } [BestFriend] diff --git a/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs b/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs new file mode 100644 index 0000000000..7401fd422d --- /dev/null +++ b/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs @@ -0,0 +1,46 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using Microsoft.Data.DataView; + +namespace Microsoft.ML.Data.Evaluators.Metrics +{ + /// + /// Evaluation results for anomaly detection. + /// + public sealed class AnomalyDetectionMetrics + { + /// + /// Gets the area under the ROC curve. + /// + /// + /// The area under the ROC curve is equal to the probability that the algorithm ranks + /// a randomly chosen positive instance higher than a randomly chosen negative one + /// (assuming 'positive' ranks higher than 'negative'). + /// + public double Auc { get; } + + /// + /// Detection rate at K false positives. + /// + /// + /// This is computed as follows: + /// 1.Sort the test examples by the output of the anomaly detector in descending order of scores. 
+ /// 2.Among the top K False Positives, compute ratio : (True Positive @ K) / (Total anomalies in test data) + /// Example confusion matrix for anomaly detection: + /// Anomalies (in test data) | Non-Anomalies (in test data) + /// Predicted Anomalies : TP | FP + /// Predicted Non-Anomalies : FN | TN + /// + public double DrAtK { get; } + + internal AnomalyDetectionMetrics(IExceptionContext ectx, Row overallResult) + { + double FetchDouble(string name) => RowCursorUtils.Fetch(ectx, overallResult, name); + Auc = FetchDouble(BinaryClassifierEvaluator.Auc); + DrAtK = FetchDouble(AnomalyDetectionEvaluator.OverallMetrics.DrAtK); + } + } +} diff --git a/src/Microsoft.ML.Data/MLContext.cs b/src/Microsoft.ML.Data/MLContext.cs index 75c2554570..66337a7ec9 100644 --- a/src/Microsoft.ML.Data/MLContext.cs +++ b/src/Microsoft.ML.Data/MLContext.cs @@ -40,6 +40,11 @@ public sealed class MLContext : IHostEnvironment /// public RankingCatalog Ranking { get; } + /// + /// Trainers and tasks specific to anomaly detection problems. + /// + public AnomalyDetectionCatalog AnomalyDetection { get; } + /// /// Data processing operations. /// @@ -83,6 +88,7 @@ public MLContext(int? 
seed = null, int conc = 0) Regression = new RegressionCatalog(_env); Clustering = new ClusteringCatalog(_env); Ranking = new RankingCatalog(_env); + AnomalyDetection = new AnomalyDetectionCatalog(_env); Transforms = new TransformsCatalog(_env); Model = new ModelOperationsCatalog(_env); Data = new DataOperationsCatalog(_env); diff --git a/src/Microsoft.ML.Data/TrainCatalog.cs b/src/Microsoft.ML.Data/TrainCatalog.cs index ef170f448e..2c95b76519 100644 --- a/src/Microsoft.ML.Data/TrainCatalog.cs +++ b/src/Microsoft.ML.Data/TrainCatalog.cs @@ -6,6 +6,7 @@ using System.Linq; using Microsoft.Data.DataView; using Microsoft.ML.Data; +using Microsoft.ML.Data.Evaluators.Metrics; using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.Conversions; @@ -646,4 +647,53 @@ public RankerMetrics Evaluate(IDataView data, string label, string groupId, stri return eval.Evaluate(data, label, groupId, score); } } + + /// + /// The central catalog for anomaly detection tasks and trainers. + /// + public sealed class AnomalyDetectionCatalog : TrainCatalogBase + { + /// + /// The list of trainers for anomaly detection. + /// + public AnomalyDetectionTrainers Trainers { get; } + + internal AnomalyDetectionCatalog(IHostEnvironment env) + : base(env, nameof(AnomalyDetectionCatalog)) + { + Trainers = new AnomalyDetectionTrainers(this); + } + + public sealed class AnomalyDetectionTrainers : CatalogInstantiatorBase + { + internal AnomalyDetectionTrainers(AnomalyDetectionCatalog catalog) + : base(catalog) + { + } + } + + /// + /// Evaluates scored anomaly detection data. + /// + /// The scored data. + /// The name of the label column in . + /// The name of the score column in . + /// The name of the predicted label column in . + /// The number of false positives to compute the metric. + /// Evaluation results. 
+ public AnomalyDetectionMetrics Evaluate(IDataView data, string label = DefaultColumnNames.Label, string score = DefaultColumnNames.Score, + string predictedLabel = DefaultColumnNames.PredictedLabel, int k = 10) + { + Environment.CheckValue(data, nameof(data)); + Environment.CheckNonEmpty(label, nameof(label)); + Environment.CheckNonEmpty(score, nameof(score)); + Environment.CheckNonEmpty(predictedLabel, nameof(predictedLabel)); + + var args = new AnomalyDetectionEvaluator.Arguments(); + args.K = k; + + var eval = new AnomalyDetectionEvaluator(Environment, args); + return eval.Evaluate(data, label, score, predictedLabel); + } + } } diff --git a/src/Microsoft.ML.PCA/PCACatalog.cs b/src/Microsoft.ML.PCA/PCACatalog.cs index d60bf7d1d0..696bf1f741 100644 --- a/src/Microsoft.ML.PCA/PCACatalog.cs +++ b/src/Microsoft.ML.PCA/PCACatalog.cs @@ -3,13 +3,14 @@ // See the LICENSE file in the project root for more information. using Microsoft.ML.Data; +using Microsoft.ML.Trainers.PCA; using Microsoft.ML.Transforms.Projections; +using static Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer; namespace Microsoft.ML { public static class PcaCatalog { - /// Initializes a new instance of . /// The transform's catalog. /// Name of the column resulting from the transformation of . @@ -35,5 +36,40 @@ public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(t /// Input columns to apply PrincipalComponentAnalysis on. public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(this TransformsCatalog.ProjectionTransforms catalog, params PrincipalComponentAnalysisEstimator.ColumnInfo[] columns) => new PrincipalComponentAnalysisEstimator(CatalogUtils.GetEnvironment(catalog), columns); + + /// + /// Trains an approximate PCA using Randomized SVD algorithm. + /// + /// The anomaly detection catalog trainer object. + /// The features, or independent variables. + /// The optional example weights. + /// The number of components in the PCA. 
+ /// Oversampling parameter for randomized PCA training. + /// If enabled, data is centered to be zero mean. + /// The seed for random number generation. + public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog, + string featureColumn = DefaultColumnNames.Features, + string weights = null, + int rank = Options.Defaults.NumComponents, + int oversampling = Options.Defaults.OversamplingParameters, + bool center = Options.Defaults.IsCenteredZeroMean, + int? seed = null) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new RandomizedPcaTrainer(env, featureColumn, weights, rank, oversampling, center, seed); + } + + /// + /// Trains an approximate PCA using Randomized SVD algorithm. + /// + /// The anomaly detection catalog trainer object. + /// Advanced options to the algorithm. + public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog, Options options) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new RandomizedPcaTrainer(env, options); + } } } diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs index 78ccf43ffb..f406b304f7 100644 --- a/src/Microsoft.ML.PCA/PcaTrainer.cs +++ b/src/Microsoft.ML.PCA/PcaTrainer.cs @@ -18,7 +18,7 @@ using Microsoft.ML.Trainers.PCA; using Microsoft.ML.Training; -[assembly: LoadableClass(RandomizedPcaTrainer.Summary, typeof(RandomizedPcaTrainer), typeof(RandomizedPcaTrainer.Arguments), +[assembly: LoadableClass(RandomizedPcaTrainer.Summary, typeof(RandomizedPcaTrainer), typeof(RandomizedPcaTrainer.Options), new[] { typeof(SignatureAnomalyDetectorTrainer), typeof(SignatureTrainer) }, RandomizedPcaTrainer.UserNameValue, RandomizedPcaTrainer.LoadNameValue, @@ -48,24 +48,31 @@ public sealed class RandomizedPcaTrainer : TrainerEstimatorBaseOversampling parameter for randomized 
PCA training. /// If enabled, data is centered to be zero mean. /// The seed for random number generation. - public RandomizedPcaTrainer(IHostEnvironment env, + internal RandomizedPcaTrainer(IHostEnvironment env, string features, string weights = null, - int rank = 20, - int oversampling = 20, - bool center = true, + int rank = Options.Defaults.NumComponents, + int oversampling = Options.Defaults.OversamplingParameters, + bool center = Options.Defaults.IsCenteredZeroMean, int? seed = null) : this(env, null, features, weights, rank, oversampling, center, seed) { } - internal RandomizedPcaTrainer(IHostEnvironment env, Arguments args) - :this(env, args, args.FeatureColumn, args.WeightColumn) + internal RandomizedPcaTrainer(IHostEnvironment env, Options options) + :this(env, options, options.FeatureColumn, options.WeightColumn) { } - private RandomizedPcaTrainer(IHostEnvironment env, Arguments args, string featureColumn, string weightColumn, + private RandomizedPcaTrainer(IHostEnvironment env, Options options, string featureColumn, string weightColumn, int rank = 20, int oversampling = 20, bool center = true, int? seed = null) : base(Contracts.CheckRef(env, nameof(env)).Register(LoadNameValue), TrainerUtils.MakeR4VecFeature(featureColumn), default, TrainerUtils.MakeR4ScalarWeightColumn(weightColumn)) { // if the args are not null, we got here from maml, and the internal ctor. - if (args != null) + if (options != null) { - _rank = args.Rank; - _center = args.Center; - _oversampling = args.Oversampling; - _seed = args.Seed ?? Host.Rand.Next(); + _rank = options.Rank; + _center = options.Center; + _oversampling = options.Oversampling; + _seed = options.Seed ?? 
Host.Rand.Next(); } else { @@ -346,14 +353,14 @@ protected override AnomalyPredictionTransformer MakeTransfor Desc = "Train an PCA Anomaly model.", UserName = UserNameValue, ShortName = ShortName)] - internal static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironment env, Arguments input) + internal static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironment env, Options input) { Contracts.CheckValue(env, nameof(env)); var host = env.Register("TrainPCAAnomaly"); host.CheckValue(input, nameof(input)); EntryPointUtils.CheckInputArgs(host, input); - return LearnerEntryPointsUtils.Train(host, input, + return LearnerEntryPointsUtils.Train(host, input, () => new RandomizedPcaTrainer(host, input), getWeight: () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn)); } diff --git a/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs b/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs index 76e82b8d0f..a4571c9c78 100644 --- a/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs @@ -5,6 +5,7 @@ using System.Runtime.CompilerServices; using Microsoft.ML; +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Core.Tests" + PublicKey.TestValue)] diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index 34822a5f0f..039551b034 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -63,7 +63,7 @@ Trainers.LogisticRegressionClassifier Logistic Regression is a method in statist Trainers.NaiveBayesClassifier Train a MultiClassNaiveBayesTrainer. 
Microsoft.ML.Trainers.MultiClassNaiveBayesTrainer TrainMultiClassNaiveBayesTrainer Microsoft.ML.Trainers.MultiClassNaiveBayesTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+MulticlassClassificationOutput Trainers.OnlineGradientDescentRegressor Train a Online gradient descent perceptron. Microsoft.ML.Trainers.Online.OnlineGradientDescentTrainer TrainRegression Microsoft.ML.Trainers.Online.OnlineGradientDescentTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+RegressionOutput Trainers.OrdinaryLeastSquaresRegressor Train an OLS regression model. Microsoft.ML.Trainers.HalLearners.OlsLinearRegressionTrainer TrainRegression Microsoft.ML.Trainers.HalLearners.OlsLinearRegressionTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+RegressionOutput -Trainers.PcaAnomalyDetector Train an PCA Anomaly model. Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer TrainPcaAnomaly Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+AnomalyDetectionOutput +Trainers.PcaAnomalyDetector Train an PCA Anomaly model. Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer TrainPcaAnomaly Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+AnomalyDetectionOutput Trainers.PoissonRegressor Train an Poisson regression model. Microsoft.ML.Trainers.PoissonRegression TrainRegression Microsoft.ML.Trainers.PoissonRegression+Options Microsoft.ML.EntryPoints.CommonOutputs+RegressionOutput Trainers.StochasticDualCoordinateAscentBinaryClassifier Train an SDCA binary model. Microsoft.ML.Trainers.Sdca TrainBinary Microsoft.ML.Trainers.SdcaBinaryTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+BinaryClassificationOutput Trainers.StochasticDualCoordinateAscentClassifier The SDCA linear multi-class classification trainer. 
Microsoft.ML.Trainers.Sdca TrainMultiClass Microsoft.ML.Trainers.SdcaMultiClassTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+MulticlassClassificationOutput diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index eec14eb56b..2d82b68bbc 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -3414,7 +3414,7 @@ public void EntryPointPcaPredictorSummary() InputFile = inputFile, }).Data; - var pcaInput = new RandomizedPcaTrainer.Arguments + var pcaInput = new RandomizedPcaTrainer.Options { TrainingData = dataView, }; diff --git a/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs new file mode 100644 index 0000000000..251b3cc611 --- /dev/null +++ b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs @@ -0,0 +1,61 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+
+using System;
+using System.Drawing;
+using System.Drawing.Imaging;
+using System.IO;
+using System.Linq;
+using Microsoft.Data.DataView;
+using Microsoft.ML.Data;
+using Microsoft.ML.ImageAnalytics;
+using Microsoft.ML.Model;
+using Microsoft.ML.RunTests;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Microsoft.ML.Tests
+{
+    public class AnomalyDetectionTests : TestDataPipeBase
+    {
+        public AnomalyDetectionTests(ITestOutputHelper output) : base(output)
+        {
+        }
+
+        /// <summary>
+        /// Trains RandomizedPca on the mnistOneClass train set and checks AUC and DR@K on the test set against baselines.
+        /// </summary>
+        [Fact]
+        public void RandomizedPcaTrainerBaselineTest()
+        {
+            string featureColumn = "NumericFeatures";
+
+            // Tab-separated input with a header row: column 0 is the label, columns 1-784 form the feature vector.
+            var reader = new TextLoader(Env, new TextLoader.Arguments()
+            {
+                HasHeader = true,
+                Separator = "\t",
+                Columns = new[]
+                {
+                    new TextLoader.Column("Label", DataKind.R4, 0),
+                    new TextLoader.Column(featureColumn, DataKind.R4, new [] { new TextLoader.Range(1, 784) })
+                }
+            });
+
+            var trainData = reader.Read(GetDataPath(TestDatasets.mnistOneClass.trainFilename));
+            var testData = reader.Read(GetDataPath(TestDatasets.mnistOneClass.testFilename));
+
+            var pipeline = ML.AnomalyDetection.Trainers.RandomizedPca(featureColumn);
+
+            var transformer = pipeline.Fit(trainData);
+            var transformedData = transformer.Transform(testData);
+
+            // Evaluate with the detection rate computed at k = 5 false positives; baselines are compared to 5 decimals.
+            var metrics = ML.AnomalyDetection.Evaluate(transformedData, k: 5);
+
+            Assert.Equal(0.98269, metrics.Auc, 5);
+            Assert.Equal(0.90000, metrics.DrAtK, 5);
+        }
+    }
+}