-
Notifications
You must be signed in to change notification settings - Fork 1.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Modify API for advanced settings (RandomizedPcaTrainer) #2390
Changes from all commits
77746d6
03e98b8
e5b1a74
8c47da1
398475a
8adb8b1
62f5db8
02b8651
b82c326
b1526d1
d90ded5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using System; | ||
using Microsoft.Data.DataView; | ||
|
||
namespace Microsoft.ML.Data.Evaluators.Metrics | ||
{ | ||
/// <summary> | ||
/// Evaluation results for anomaly detection: the area under the ROC curve (<see cref="Auc"/>) | ||
/// and the detection rate at K false positives (<see cref="DrAtK"/>). | ||
/// </summary> | ||
public sealed class AnomalyDetectionMetrics | ||
{ | ||
/// <summary> | ||
/// Gets the area under the ROC curve. | ||
/// </summary> | ||
/// <remarks> | ||
/// The area under the ROC curve is equal to the probability that the algorithm ranks | ||
/// a randomly chosen positive instance higher than a randomly chosen negative one | ||
/// (assuming 'positive' ranks higher than 'negative'). | ||
/// </remarks> | ||
public double Auc { get; } | ||
|
||
/// <summary> | ||
/// Gets the detection rate at K false positives. | ||
/// </summary> | ||
/// <remarks> | ||
/// This is computed as follows: | ||
/// 1. Sort the test examples by the output of the anomaly detector in descending order of scores. | ||
/// 2. Among the top K false positives, compute the ratio: (true positives @ K) / (total anomalies in the test data). | ||
/// Example confusion matrix for anomaly detection: | ||
/// Anomalies (in test data) | Non-Anomalies (in test data) | ||
/// Predicted Anomalies : TP | FP | ||
/// Predicted Non-Anomalies : FN | TN | ||
/// </remarks> | ||
public double DrAtK { get; } | ||
|
||
/// <summary> | ||
/// Populates <see cref="Auc"/> and <see cref="DrAtK"/> by fetching them, by name, from <paramref name="overallResult"/>. | ||
/// </summary> | ||
/// <param name="ectx">Exception context handed to <c>RowCursorUtils.Fetch</c> for each lookup.</param> | ||
/// <param name="overallResult">Row from which the two metric values are fetched as doubles.</param> | ||
internal AnomalyDetectionMetrics(IExceptionContext ectx, Row overallResult) | ||
{ | ||
// Local helper: every metric here is fetched as a double from the same row, keyed by name. | ||
double FetchDouble(string name) => RowCursorUtils.Fetch<double>(ectx, overallResult, name); | ||
// The AUC name constant comes from BinaryClassifierEvaluator; the DrAtK name comes from AnomalyDetectionEvaluator.OverallMetrics. | ||
Auc = FetchDouble(BinaryClassifierEvaluator.Auc); | ||
DrAtK = FetchDouble(AnomalyDetectionEvaluator.OverallMetrics.DrAtK); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,13 +3,14 @@ | |
// See the LICENSE file in the project root for more information. | ||
|
||
using Microsoft.ML.Data; | ||
using Microsoft.ML.Trainers.PCA; | ||
using Microsoft.ML.Transforms.Projections; | ||
using static Microsoft.ML.Trainers.PCA.RandomizedPcaTrainer; | ||
|
||
namespace Microsoft.ML | ||
{ | ||
public static class PcaCatalog | ||
{ | ||
|
||
/// <summary>Initializes a new instance of <see cref="PrincipalComponentAnalysisEstimator"/>.</summary> | ||
/// <param name="catalog">The transform's catalog.</param> | ||
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param> | ||
|
@@ -35,5 +36,40 @@ public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(t | |
/// <param name="columns">Input columns to apply PrincipalComponentAnalysis on.</param> | ||
public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(this TransformsCatalog.ProjectionTransforms catalog, params PrincipalComponentAnalysisEstimator.ColumnInfo[] columns) | ||
=> new PrincipalComponentAnalysisEstimator(CatalogUtils.GetEnvironment(catalog), columns); | ||
|
||
/// <summary> | ||
/// Creates a <see cref="RandomizedPcaTrainer"/>, which trains an approximate PCA using Randomized SVD algorithm. | ||
/// </summary> | ||
/// <param name="catalog">The anomaly detection catalog trainer object. Must not be null.</param> | ||
/// <param name="featureColumn">Name of the column holding the features, or independent variables. Defaults to <see cref="DefaultColumnNames.Features"/>.</param> | ||
/// <param name="weights">Name of the optional example weight column; null when there is none.</param> | ||
/// <param name="rank">The number of components in the PCA.</param> | ||
/// <param name="oversampling">Oversampling parameter for randomized PCA training.</param> | ||
/// <param name="center">If enabled, data is centered to be zero mean.</param> | ||
/// <param name="seed">The seed for random number generation.</param> | ||
/// <returns>A new <see cref="RandomizedPcaTrainer"/> built from the catalog's environment and the given arguments.</returns> | ||
public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Needs xml docs with remarks and links to a sample. Here or add to #1209 . #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
string featureColumn = DefaultColumnNames.Features, | ||
string weights = null, | ||
int rank = Options.Defaults.NumComponents, | ||
int oversampling = Options.Defaults.OversamplingParameters, | ||
bool center = Options.Defaults.IsCenteredZeroMean, | ||
int? seed = null) | ||
{ | ||
// Validate the catalog before asking it for its environment. | ||
Contracts.CheckValue(catalog, nameof(catalog)); | ||
var env = CatalogUtils.GetEnvironment(catalog); | ||
return new RandomizedPcaTrainer(env, featureColumn, weights, rank, oversampling, center, seed); | ||
} | ||
|
||
/// <summary> | ||
/// Creates a <see cref="RandomizedPcaTrainer"/>, which trains an approximate PCA using Randomized SVD algorithm, | ||
/// configured from an options object rather than individual arguments. | ||
/// </summary> | ||
/// <param name="catalog">The anomaly detection catalog trainer object. Must not be null.</param> | ||
/// <param name="options">Advanced options to the algorithm.</param> | ||
/// <returns>A new <see cref="RandomizedPcaTrainer"/> built from the catalog's environment and <paramref name="options"/>.</returns> | ||
public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog, Options options) | ||
{ | ||
// Validate the catalog before asking it for its environment. | ||
Contracts.CheckValue(catalog, nameof(catalog)); | ||
var env = CatalogUtils.GetEnvironment(catalog); | ||
return new RandomizedPcaTrainer(env, options); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,7 +18,7 @@ | |
using Microsoft.ML.Trainers.PCA; | ||
using Microsoft.ML.Training; | ||
|
||
[assembly: LoadableClass(RandomizedPcaTrainer.Summary, typeof(RandomizedPcaTrainer), typeof(RandomizedPcaTrainer.Arguments), | ||
[assembly: LoadableClass(RandomizedPcaTrainer.Summary, typeof(RandomizedPcaTrainer), typeof(RandomizedPcaTrainer.Options), | ||
new[] { typeof(SignatureAnomalyDetectorTrainer), typeof(SignatureTrainer) }, | ||
RandomizedPcaTrainer.UserNameValue, | ||
RandomizedPcaTrainer.LoadNameValue, | ||
|
@@ -48,24 +48,31 @@ public sealed class RandomizedPcaTrainer : TrainerEstimatorBase<AnomalyPredictio | |
internal const string Summary = "This algorithm trains an approximate PCA using Randomized SVD algorithm. " | ||
+ "This PCA can be made into Kernel PCA by using Random Fourier Features transform."; | ||
|
||
/// <summary> | ||
/// Advanced options for <see cref="RandomizedPcaTrainer"/>: rank, oversampling, centering and random seed. | ||
/// </summary> | ||
public class Arguments : UnsupervisedLearnerInputBaseWithWeight | ||
public class Options : UnsupervisedLearnerInputBaseWithWeight | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
xml docs are coming later? #Pending There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
{ | ||
/// <summary> | ||
/// The number of components in the PCA. | ||
/// </summary> | ||
[Argument(ArgumentType.AtMostOnce, HelpText = "The number of components in the PCA", ShortName = "k", SortOrder = 50)] | ||
[TGUI(SuggestedSweeps = "10,20,40,80")] | ||
[TlcModule.SweepableDiscreteParam("Rank", new object[] { 10, 20, 40, 80 })] | ||
public int Rank = 20; | ||
public int Rank = Defaults.NumComponents; | ||
|
||
/// <summary> | ||
/// Oversampling parameter for randomized PCA training. | ||
/// </summary> | ||
[Argument(ArgumentType.AtMostOnce, HelpText = "Oversampling parameter for randomized PCA training", SortOrder = 50)] | ||
[TGUI(SuggestedSweeps = "10,20,40")] | ||
[TlcModule.SweepableDiscreteParam("Oversampling", new object[] { 10, 20, 40 })] | ||
public int Oversampling = 20; | ||
public int Oversampling = Defaults.OversamplingParameters; | ||
|
||
/// <summary> | ||
/// If enabled, data is centered to be zero mean. | ||
/// </summary> | ||
[Argument(ArgumentType.AtMostOnce, HelpText = "If enabled, data is centered to be zero mean", ShortName = "center")] | ||
[TlcModule.SweepableDiscreteParam("Center", null, isBool: true)] | ||
public bool Center = true; | ||
public bool Center = Defaults.IsCenteredZeroMean; | ||
|
||
/// <summary> | ||
/// The seed for random number generation. When left null, the trainer takes one from its host's random generator. | ||
/// </summary> | ||
[Argument(ArgumentType.AtMostOnce, HelpText = "The seed for random number generation", ShortName = "seed")] | ||
public int? Seed; | ||
|
||
// Single source of truth for the default values; also referenced as the defaults of optional | ||
// parameters outside this class, which is why they are compile-time constants. | ||
internal static class Defaults | ||
{ | ||
public const int NumComponents = 20; | ||
public const int OversamplingParameters = 20; | ||
public const bool IsCenteredZeroMean = true; | ||
} | ||
} | ||
|
||
private readonly int _rank; | ||
|
@@ -90,35 +97,35 @@ public class Arguments : UnsupervisedLearnerInputBaseWithWeight | |
/// <param name="oversampling">Oversampling parameter for randomized PCA training.</param> | ||
/// <param name="center">If enabled, data is centered to be zero mean.</param> | ||
/// <param name="seed">The seed for random number generation.</param> | ||
public RandomizedPcaTrainer(IHostEnvironment env, | ||
internal RandomizedPcaTrainer(IHostEnvironment env, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't we just make the class There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not really. we would want to expose these through mlcontext. not via constructors In reply to: 254391018 [](ancestors = 254391018) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Resolved. I hadn't seen the pattern for trainable transforms where the class is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i believe in ML.NET terms, this is "trainer estimator" (for anomaly detection tasks) most other "trainer estimator"s follow the same pattern e.g. KMeansPlusPlusTrainer In reply to: 255587702 [](ancestors = 255587702) |
||
string features, | ||
string weights = null, | ||
int rank = 20, | ||
int oversampling = 20, | ||
bool center = true, | ||
int rank = Options.Defaults.NumComponents, | ||
int oversampling = Options.Defaults.OversamplingParameters, | ||
bool center = Options.Defaults.IsCenteredZeroMean, | ||
int? seed = null) | ||
: this(env, null, features, weights, rank, oversampling, center, seed) | ||
{ | ||
|
||
} | ||
|
||
/// <summary> | ||
/// Initializes the trainer from an options object, forwarding the options together with their | ||
/// <c>FeatureColumn</c> and <c>WeightColumn</c> values to the private constructor. | ||
/// </summary> | ||
/// <param name="env">The host environment.</param> | ||
/// <param name="options">Advanced options to the algorithm.</param> | ||
// NOTE(review): the options object is dereferenced in the constructor initializer, before any null check runs; a null value would surface as a NullReferenceException. Confirm callers always validate it first. | ||
internal RandomizedPcaTrainer(IHostEnvironment env, Arguments args) | ||
:this(env, args, args.FeatureColumn, args.WeightColumn) | ||
internal RandomizedPcaTrainer(IHostEnvironment env, Options options) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It's strange... I noticed that renaming Arguments to Options did not modify anything in the mlContext catalog. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I looked it up, and I don't think there is an entry for this trainer in mlContext. Can you add it? In reply to: 253319239 [](ancestors = 253319239) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah. i noticed couple more components which do not have mlcontext extension. will add In reply to: 253319255 [](ancestors = 253319255,253319239) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i added mlcontext extension for this. Also added a test for it that exercises the Fit() and Transform() APIs. Evaluate() API currently missing from Anomaly Detection. i will create a separate issue for that. In reply to: 253584603 [](ancestors = 253584603,253319255,253319239) |
||
:this(env, options, options.FeatureColumn, options.WeightColumn) | ||
{ | ||
|
||
} | ||
|
||
private RandomizedPcaTrainer(IHostEnvironment env, Arguments args, string featureColumn, string weightColumn, | ||
private RandomizedPcaTrainer(IHostEnvironment env, Options options, string featureColumn, string weightColumn, | ||
int rank = 20, int oversampling = 20, bool center = true, int? seed = null) | ||
: base(Contracts.CheckRef(env, nameof(env)).Register(LoadNameValue), TrainerUtils.MakeR4VecFeature(featureColumn), default, TrainerUtils.MakeR4ScalarWeightColumn(weightColumn)) | ||
{ | ||
// if the args are not null, we got here from maml, and the internal ctor. | ||
if (args != null) | ||
if (options != null) | ||
{ | ||
_rank = args.Rank; | ||
_center = args.Center; | ||
_oversampling = args.Oversampling; | ||
_seed = args.Seed ?? Host.Rand.Next(); | ||
_rank = options.Rank; | ||
_center = options.Center; | ||
_oversampling = options.Oversampling; | ||
_seed = options.Seed ?? Host.Rand.Next(); | ||
} | ||
else | ||
{ | ||
|
@@ -346,14 +353,14 @@ protected override AnomalyPredictionTransformer<PcaModelParameters> MakeTransfor | |
Desc = "Train an PCA Anomaly model.", | ||
UserName = UserNameValue, | ||
ShortName = ShortName)] | ||
internal static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironment env, Arguments input) | ||
internal static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironment env, Options input) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as above; these can be kept There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The In reply to: 254391534 [](ancestors = 254391534) |
||
{ | ||
Contracts.CheckValue(env, nameof(env)); | ||
var host = env.Register("TrainPCAAnomaly"); | ||
host.CheckValue(input, nameof(input)); | ||
EntryPointUtils.CheckInputArgs(host, input); | ||
|
||
return LearnerEntryPointsUtils.Train<Arguments, CommonOutputs.AnomalyDetectionOutput>(host, input, | ||
return LearnerEntryPointsUtils.Train<Options, CommonOutputs.AnomalyDetectionOutput>(host, input, | ||
() => new RandomizedPcaTrainer(host, input), | ||
getWeight: () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn)); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Summaries, Remarks, and links to relevant documentation. #Resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added basic summaries for now.
I also wanted to add the remarks from the TLC website, but the explanations there were not clear, especially for the detection rate metrics.
In reply to: 255583277 [](ancestors = 255583277)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For these summaries, check in with @shmoradims; he's building a set of generic docs for things like AUC, F1, RMSE, etc.
In reply to: 255703503 [](ancestors = 255703503,255583277)