diff --git a/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs b/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs
index 1878fa9d0c..cfd0f5fd29 100644
--- a/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs
+++ b/src/Microsoft.ML.Data/Evaluators/AnomalyDetectionEvaluator.cs
@@ -10,6 +10,7 @@
using Microsoft.ML;
using Microsoft.ML.CommandLine;
using Microsoft.ML.Data;
+using Microsoft.ML.Data.Evaluators.Metrics;
using Microsoft.ML.EntryPoints;
using Microsoft.ML.Internal.Utilities;
using Microsoft.ML.Transforms;
@@ -576,6 +577,44 @@ public void Finish()
FinishOtherMetrics();
}
}
+
+ /// <summary>
+ /// Evaluates scored anomaly detection data.
+ /// </summary>
+ /// <param name="data">The scored data.</param>
+ /// <param name="label">The name of the label column in <paramref name="data"/>.</param>
+ /// <param name="score">The name of the score column in <paramref name="data"/>.</param>
+ /// <param name="predictedLabel">The name of the predicted label column in <paramref name="data"/>.</param>
+ /// <returns>The evaluation results for these outputs.</returns>
+ internal AnomalyDetectionMetrics Evaluate(IDataView data, string label = DefaultColumnNames.Label, string score = DefaultColumnNames.Score,
+ string predictedLabel = DefaultColumnNames.PredictedLabel)
+ {
+ Host.CheckValue(data, nameof(data));
+ Host.CheckNonEmpty(label, nameof(label));
+ Host.CheckNonEmpty(score, nameof(score));
+ Host.CheckNonEmpty(predictedLabel, nameof(predictedLabel));
+
+ var roles = new RoleMappedData(data, opt: false, // map the caller's column names onto the label, score and predicted-label roles
+ RoleMappedSchema.ColumnRole.Label.Bind(label),
+ RoleMappedSchema.CreatePair(MetadataUtils.Const.ScoreValueKind.Score, score),
+ RoleMappedSchema.CreatePair(MetadataUtils.Const.ScoreValueKind.PredictedLabel, predictedLabel));
+
+ var resultDict = ((IEvaluator)this).Evaluate(roles); // invoke Evaluate through the IEvaluator interface
+ Host.Assert(resultDict.ContainsKey(MetricKinds.OverallMetrics));
+ var overall = resultDict[MetricKinds.OverallMetrics];
+
+ AnomalyDetectionMetrics result;
+ using (var cursor = overall.GetRowCursorForAllColumns()) // the overall-metrics view is expected to hold exactly one row (asserted below)
+ {
+ var moved = cursor.MoveNext();
+ Host.Assert(moved);
+ result = new AnomalyDetectionMetrics(Host, cursor); // the metrics object is built from the current cursor row
+ moved = cursor.MoveNext();
+ Host.Assert(!moved);
+ }
+ return result;
+ }
+
}
[BestFriend]
diff --git a/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs b/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs
new file mode 100644
index 0000000000..7401fd422d
--- /dev/null
+++ b/src/Microsoft.ML.Data/Evaluators/Metrics/AnomalyDetectionMetrics.cs
@@ -0,0 +1,46 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using Microsoft.Data.DataView;
+
+namespace Microsoft.ML.Data.Evaluators.Metrics
+{
+    /// <summary>
+    /// Evaluation results for anomaly detection.
+    /// </summary>
+    public sealed class AnomalyDetectionMetrics
+    {
+        /// <summary>
+        /// Gets the area under the ROC curve.
+        /// </summary>
+        /// <remarks>
+        /// The area under the ROC curve is equal to the probability that the algorithm ranks
+        /// a randomly chosen positive instance higher than a randomly chosen negative one
+        /// (assuming 'positive' ranks higher than 'negative').
+        /// </remarks>
+        public double Auc { get; }
+
+        /// <summary>
+        /// Detection rate at K false positives.
+        /// </summary>
+        /// <remarks>
+        /// This is computed as follows:
+        /// 1. Sort the test examples by the output of the anomaly detector in descending order of scores.
+        /// 2. Among the top K False Positives, compute ratio : (True Positive @ K) / (Total anomalies in test data)
+        /// Example confusion matrix for anomaly detection:
+        ///                           Anomalies (in test data) | Non-Anomalies (in test data)
+        /// Predicted Anomalies     :            TP            |             FP
+        /// Predicted Non-Anomalies :            FN            |             TN
+        /// </remarks>
+        public double DrAtK { get; }
+
+        internal AnomalyDetectionMetrics(IExceptionContext ectx, Row overallResult) // populates both metrics from one overall-results row
+        {
+            double FetchDouble(string name) => RowCursorUtils.Fetch<double>(ectx, overallResult, name); // reads the named column of the row as a double
+            Auc = FetchDouble(BinaryClassifierEvaluator.Auc);
+            DrAtK = FetchDouble(AnomalyDetectionEvaluator.OverallMetrics.DrAtK);
+        }
+    }
+}
diff --git a/src/Microsoft.ML.Data/MLContext.cs b/src/Microsoft.ML.Data/MLContext.cs
index 75c2554570..66337a7ec9 100644
--- a/src/Microsoft.ML.Data/MLContext.cs
+++ b/src/Microsoft.ML.Data/MLContext.cs
@@ -40,6 +40,11 @@ public sealed class MLContext : IHostEnvironment
///
public RankingCatalog Ranking { get; }
+ /// <summary>
+ /// Trainers and tasks specific to anomaly detection problems.
+ /// </summary>
+ public AnomalyDetectionCatalog AnomalyDetection { get; }
+
///
/// Data processing operations.
///
@@ -83,6 +88,7 @@ public MLContext(int? seed = null, int conc = 0)
Regression = new RegressionCatalog(_env);
Clustering = new ClusteringCatalog(_env);
Ranking = new RankingCatalog(_env);
+ AnomalyDetection = new AnomalyDetectionCatalog(_env);
Transforms = new TransformsCatalog(_env);
Model = new ModelOperationsCatalog(_env);
Data = new DataOperationsCatalog(_env);
diff --git a/src/Microsoft.ML.Data/TrainCatalog.cs b/src/Microsoft.ML.Data/TrainCatalog.cs
index ef170f448e..2c95b76519 100644
--- a/src/Microsoft.ML.Data/TrainCatalog.cs
+++ b/src/Microsoft.ML.Data/TrainCatalog.cs
@@ -6,6 +6,7 @@
using System.Linq;
using Microsoft.Data.DataView;
using Microsoft.ML.Data;
+using Microsoft.ML.Data.Evaluators.Metrics;
using Microsoft.ML.Transforms;
using Microsoft.ML.Transforms.Conversions;
@@ -646,4 +647,53 @@ public RankerMetrics Evaluate(IDataView data, string label, string groupId, stri
return eval.Evaluate(data, label, groupId, score);
}
}
+
+ /// <summary>
+ /// The central catalog for anomaly detection tasks and trainers.
+ /// </summary>
+ public sealed class AnomalyDetectionCatalog : TrainCatalogBase
+ {
+ /// <summary>
+ /// The list of trainers for anomaly detection.
+ /// </summary>
+ public AnomalyDetectionTrainers Trainers { get; }
+
+ internal AnomalyDetectionCatalog(IHostEnvironment env)
+ : base(env, nameof(AnomalyDetectionCatalog))
+ {
+ Trainers = new AnomalyDetectionTrainers(this);
+ }
+
+ public sealed class AnomalyDetectionTrainers : CatalogInstantiatorBase // declares only a constructor; trainers are exposed as extension methods on this type (e.g. PcaCatalog.RandomizedPca)
+ {
+ internal AnomalyDetectionTrainers(AnomalyDetectionCatalog catalog)
+ : base(catalog)
+ {
+ }
+ }
+
+ /// <summary>
+ /// Evaluates scored anomaly detection data.
+ /// </summary>
+ /// <param name="data">The scored data.</param>
+ /// <param name="label">The name of the label column in <paramref name="data"/>.</param>
+ /// <param name="score">The name of the score column in <paramref name="data"/>.</param>
+ /// <param name="predictedLabel">The name of the predicted label column in <paramref name="data"/>.</param>
+ /// <param name="k">The number of false positives to compute the <see cref="AnomalyDetectionMetrics.DrAtK"/> metric.</param>
+ /// <returns>Evaluation results.</returns>
+ public AnomalyDetectionMetrics Evaluate(IDataView data, string label = DefaultColumnNames.Label, string score = DefaultColumnNames.Score,
+ string predictedLabel = DefaultColumnNames.PredictedLabel, int k = 10)
+ {
+ Environment.CheckValue(data, nameof(data));
+ Environment.CheckNonEmpty(label, nameof(label));
+ Environment.CheckNonEmpty(score, nameof(score));
+ Environment.CheckNonEmpty(predictedLabel, nameof(predictedLabel));
+
+ var args = new AnomalyDetectionEvaluator.Arguments();
+ args.K = k; // k reaches the evaluator through its Arguments object
+
+ var eval = new AnomalyDetectionEvaluator(Environment, args);
+ return eval.Evaluate(data, label, score, predictedLabel);
+ }
+ }
}
diff --git a/src/Microsoft.ML.PCA/PCACatalog.cs b/src/Microsoft.ML.PCA/PCACatalog.cs
index d60bf7d1d0..696bf1f741 100644
--- a/src/Microsoft.ML.PCA/PCACatalog.cs
+++ b/src/Microsoft.ML.PCA/PCACatalog.cs
@@ -3,13 +3,14 @@
// See the LICENSE file in the project root for more information.
using Microsoft.ML.Data;
+using Microsoft.ML.Trainers.PCA;
using Microsoft.ML.Transforms.Projections;
+using static Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer;
namespace Microsoft.ML
{
public static class PcaCatalog
{
-
/// Initializes a new instance of .
/// The transform's catalog.
/// Name of the column resulting from the transformation of .
@@ -35,5 +36,40 @@ public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(t
/// Input columns to apply PrincipalComponentAnalysis on.
public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(this TransformsCatalog.ProjectionTransforms catalog, params PrincipalComponentAnalysisEstimator.ColumnInfo[] columns)
=> new PrincipalComponentAnalysisEstimator(CatalogUtils.GetEnvironment(catalog), columns);
+
+ /// <summary>
+ /// Trains an approximate PCA using Randomized SVD algorithm.
+ /// </summary>
+ /// <param name="catalog">The anomaly detection catalog trainer object.</param>
+ /// <param name="featureColumn">The features, or independent variables.</param>
+ /// <param name="weights">The optional example weights.</param>
+ /// <param name="rank">The number of components in the PCA.</param>
+ /// <param name="oversampling">Oversampling parameter for randomized PCA training.</param>
+ /// <param name="center">If enabled, data is centered to be zero mean.</param>
+ /// <param name="seed">The seed for random number generation.</param>
+ public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog,
+ string featureColumn = DefaultColumnNames.Features,
+ string weights = null,
+ int rank = Options.Defaults.NumComponents,
+ int oversampling = Options.Defaults.OversamplingParameters,
+ bool center = Options.Defaults.IsCenteredZeroMean,
+ int? seed = null)
+ {
+ Contracts.CheckValue(catalog, nameof(catalog));
+ var env = CatalogUtils.GetEnvironment(catalog); // the trainer is created in the catalog's environment
+ return new RandomizedPcaTrainer(env, featureColumn, weights, rank, oversampling, center, seed);
+ }
+
+ /// <summary>
+ /// Trains an approximate PCA using Randomized SVD algorithm.
+ /// </summary>
+ /// <param name="catalog">The anomaly detection catalog trainer object.</param>
+ /// <param name="options">Advanced options to the algorithm.</param>
+ public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog, Options options)
+ {
+ Contracts.CheckValue(catalog, nameof(catalog));
+ var env = CatalogUtils.GetEnvironment(catalog); // the trainer is created in the catalog's environment
+ return new RandomizedPcaTrainer(env, options);
+ }
}
}
diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs
index 78ccf43ffb..f406b304f7 100644
--- a/src/Microsoft.ML.PCA/PcaTrainer.cs
+++ b/src/Microsoft.ML.PCA/PcaTrainer.cs
@@ -18,7 +18,7 @@
using Microsoft.ML.Trainers.PCA;
using Microsoft.ML.Training;
-[assembly: LoadableClass(RandomizedPcaTrainer.Summary, typeof(RandomizedPcaTrainer), typeof(RandomizedPcaTrainer.Arguments),
+[assembly: LoadableClass(RandomizedPcaTrainer.Summary, typeof(RandomizedPcaTrainer), typeof(RandomizedPcaTrainer.Options),
new[] { typeof(SignatureAnomalyDetectorTrainer), typeof(SignatureTrainer) },
RandomizedPcaTrainer.UserNameValue,
RandomizedPcaTrainer.LoadNameValue,
@@ -48,24 +48,31 @@ public sealed class RandomizedPcaTrainer : TrainerEstimatorBase
/// <param name="oversampling">Oversampling parameter for randomized PCA training.</param>
/// <param name="center">If enabled, data is centered to be zero mean.</param>
/// <param name="seed">The seed for random number generation.</param>
- public RandomizedPcaTrainer(IHostEnvironment env,
+ internal RandomizedPcaTrainer(IHostEnvironment env,
string features,
string weights = null,
- int rank = 20,
- int oversampling = 20,
- bool center = true,
+ int rank = Options.Defaults.NumComponents,
+ int oversampling = Options.Defaults.OversamplingParameters,
+ bool center = Options.Defaults.IsCenteredZeroMean,
int? seed = null)
: this(env, null, features, weights, rank, oversampling, center, seed)
{
}
- internal RandomizedPcaTrainer(IHostEnvironment env, Arguments args)
- :this(env, args, args.FeatureColumn, args.WeightColumn)
+ internal RandomizedPcaTrainer(IHostEnvironment env, Options options)
+ :this(env, options, options.FeatureColumn, options.WeightColumn)
{
}
- private RandomizedPcaTrainer(IHostEnvironment env, Arguments args, string featureColumn, string weightColumn,
+ private RandomizedPcaTrainer(IHostEnvironment env, Options options, string featureColumn, string weightColumn,
int rank = 20, int oversampling = 20, bool center = true, int? seed = null)
: base(Contracts.CheckRef(env, nameof(env)).Register(LoadNameValue), TrainerUtils.MakeR4VecFeature(featureColumn), default, TrainerUtils.MakeR4ScalarWeightColumn(weightColumn))
{
// if the args are not null, we got here from maml, and the internal ctor.
- if (args != null)
+ if (options != null)
{
- _rank = args.Rank;
- _center = args.Center;
- _oversampling = args.Oversampling;
- _seed = args.Seed ?? Host.Rand.Next();
+ _rank = options.Rank;
+ _center = options.Center;
+ _oversampling = options.Oversampling;
+ _seed = options.Seed ?? Host.Rand.Next();
}
else
{
@@ -346,14 +353,14 @@ protected override AnomalyPredictionTransformer MakeTransfor
Desc = "Train an PCA Anomaly model.",
UserName = UserNameValue,
ShortName = ShortName)]
-        internal static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironment env, Arguments input)
+        internal static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironment env, Options input)
         {
             Contracts.CheckValue(env, nameof(env));
             var host = env.Register("TrainPCAAnomaly");
             host.CheckValue(input, nameof(input));
             EntryPointUtils.CheckInputArgs(host, input);
-            return LearnerEntryPointsUtils.Train<Arguments, CommonOutputs.AnomalyDetectionOutput>(host, input,
+            return LearnerEntryPointsUtils.Train<Options, CommonOutputs.AnomalyDetectionOutput>(host, input, // input type follows the Arguments -> Options rename; output type matches the declared return type
                 () => new RandomizedPcaTrainer(host, input),
                 getWeight: () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn));
         }
diff --git a/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs b/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs
index 76e82b8d0f..a4571c9c78 100644
--- a/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs
+++ b/src/Microsoft.ML.PCA/Properties/AssemblyInfo.cs
@@ -5,6 +5,7 @@
using System.Runtime.CompilerServices;
using Microsoft.ML;
+[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Tests" + PublicKey.TestValue)]
[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)]
[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Core.Tests" + PublicKey.TestValue)]
diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
index 34822a5f0f..039551b034 100644
--- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
+++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
@@ -63,7 +63,7 @@ Trainers.LogisticRegressionClassifier Logistic Regression is a method in statist
Trainers.NaiveBayesClassifier Train a MultiClassNaiveBayesTrainer. Microsoft.ML.Trainers.MultiClassNaiveBayesTrainer TrainMultiClassNaiveBayesTrainer Microsoft.ML.Trainers.MultiClassNaiveBayesTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+MulticlassClassificationOutput
Trainers.OnlineGradientDescentRegressor Train a Online gradient descent perceptron. Microsoft.ML.Trainers.Online.OnlineGradientDescentTrainer TrainRegression Microsoft.ML.Trainers.Online.OnlineGradientDescentTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+RegressionOutput
Trainers.OrdinaryLeastSquaresRegressor Train an OLS regression model. Microsoft.ML.Trainers.HalLearners.OlsLinearRegressionTrainer TrainRegression Microsoft.ML.Trainers.HalLearners.OlsLinearRegressionTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+RegressionOutput
-Trainers.PcaAnomalyDetector Train an PCA Anomaly model. Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer TrainPcaAnomaly Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+AnomalyDetectionOutput
+Trainers.PcaAnomalyDetector Train an PCA Anomaly model. Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer TrainPcaAnomaly Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+AnomalyDetectionOutput
Trainers.PoissonRegressor Train an Poisson regression model. Microsoft.ML.Trainers.PoissonRegression TrainRegression Microsoft.ML.Trainers.PoissonRegression+Options Microsoft.ML.EntryPoints.CommonOutputs+RegressionOutput
Trainers.StochasticDualCoordinateAscentBinaryClassifier Train an SDCA binary model. Microsoft.ML.Trainers.Sdca TrainBinary Microsoft.ML.Trainers.SdcaBinaryTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+BinaryClassificationOutput
Trainers.StochasticDualCoordinateAscentClassifier The SDCA linear multi-class classification trainer. Microsoft.ML.Trainers.Sdca TrainMultiClass Microsoft.ML.Trainers.SdcaMultiClassTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+MulticlassClassificationOutput
diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
index eec14eb56b..2d82b68bbc 100644
--- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
+++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs
@@ -3414,7 +3414,7 @@ public void EntryPointPcaPredictorSummary()
InputFile = inputFile,
}).Data;
- var pcaInput = new RandomizedPcaTrainer.Arguments
+ var pcaInput = new RandomizedPcaTrainer.Options
{
TrainingData = dataView,
};
diff --git a/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs
new file mode 100644
index 0000000000..251b3cc611
--- /dev/null
+++ b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs
@@ -0,0 +1,61 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Drawing;
+using System.Drawing.Imaging;
+using System.IO;
+using System.Linq;
+using Microsoft.Data.DataView;
+using Microsoft.ML.Data;
+using Microsoft.ML.ImageAnalytics;
+using Microsoft.ML.Model;
+using Microsoft.ML.RunTests;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Microsoft.ML.Tests
+{
+    public class AnomalyDetectionTests : TestDataPipeBase
+    {
+        public AnomalyDetectionTests(ITestOutputHelper output) : base(output)
+        {
+        }
+
+        /// <summary>
+        /// Trains RandomizedPca on the MNIST one-class training file and checks AUC and DR@K on the test file against fixed baseline values.
+        /// </summary>
+        [Fact]
+        public void RandomizedPcaTrainerBaselineTest()
+        {
+            // The loader, trainer and evaluator below all run on the inherited Env / ML members; no separate MLContext is needed.
+            string featureColumn = "NumericFeatures";
+
+            var reader = new TextLoader(Env, new TextLoader.Arguments()
+            {
+                HasHeader = true,
+                Separator = "\t",
+                Columns = new[]
+                {
+                    new TextLoader.Column("Label", DataKind.R4, 0),
+                    new TextLoader.Column(featureColumn, DataKind.R4, new [] { new TextLoader.Range(1, 784) })
+                }
+            });
+
+            var trainData = reader.Read(GetDataPath(TestDatasets.mnistOneClass.trainFilename));
+            var testData = reader.Read(GetDataPath(TestDatasets.mnistOneClass.testFilename));
+
+            var pipeline = ML.AnomalyDetection.Trainers.RandomizedPca(featureColumn);
+
+            var transformer = pipeline.Fit(trainData);
+            var transformedData = transformer.Transform(testData);
+
+            // Evaluate
+            var metrics = ML.AnomalyDetection.Evaluate(transformedData, k: 5);
+
+            Assert.Equal(0.98269, metrics.Auc, 5);
+            Assert.Equal(0.90000, metrics.DrAtK, 5);
+        }
+    }
+}