Binary classification samples update (#3311)
artidoro authored Apr 16, 2019
1 parent 2e99197 commit 5538ccf
Showing 61 changed files with 2,760 additions and 702 deletions.
@@ -1,44 +1,110 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;

namespace Samples.Dynamic.Trainers.BinaryClassification
{
public static class AveragedPerceptron
{
public static void Example()
{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
// Setting the seed to a fixed number in this example to make outputs deterministic.
var mlContext = new MLContext(seed: 0);

// Create a list of training data points.
var dataPoints = GenerateRandomDataPoints(1000);

// Convert the list of data points to an IDataView object, which is consumable by ML.NET API.
var trainingData = mlContext.Data.LoadFromEnumerable(dataPoints);

// Define the trainer.
var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron();

// Train the model.
var model = pipeline.Fit(trainingData);

// Create testing data. Use different random seed to make it different from training data.
var testData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(500, seed:123));

// Run the model on test data set.
var transformedTestData = model.Transform(testData);

// Convert IDataView object to a list.
var predictions = mlContext.Data.CreateEnumerable<Prediction>(transformedTestData, reuseRowObject: false).ToList();

// Print 5 predictions.
foreach (var p in predictions.Take(5))
Console.WriteLine($"Label: {p.Label}, Prediction: {p.PredictedLabel}");

// Expected output:
// Label: True, Prediction: True
// Label: False, Prediction: False
// Label: True, Prediction: True
// Label: True, Prediction: False
// Label: False, Prediction: False

// Evaluate the overall metrics.
var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(transformedTestData);
PrintMetrics(metrics);

// Expected output:
// Accuracy: 0.72
// AUC: 0.79
// F1 Score: 0.68
// Negative Precision: 0.71
// Negative Recall: 0.80
// Positive Precision: 0.74
// Positive Recall: 0.63
}

private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed=0)
{
var random = new Random(seed);
float randomFloat() => (float)random.NextDouble();
for (int i = 0; i < count; i++)
{
var label = randomFloat() > 0.5f;
yield return new DataPoint
{
Label = label,
// Create random features that are correlated with the label.
// For data points with false label, the feature values are slightly increased by adding a constant.
Features = Enumerable.Repeat(label, 50).Select(x => x ? randomFloat() : randomFloat() + 0.1f).ToArray()
};
}
}

// Example with label and 50 feature values. A data set is a collection of such examples.
private class DataPoint
{
public bool Label { get; set; }
[VectorType(50)]
public float[] Features { get; set; }
}

// Class used to capture predictions.
private class Prediction
{
// Original label.
public bool Label { get; set; }
// Predicted label from the trainer.
public bool PredictedLabel { get; set; }
}

// Pretty-print BinaryClassificationMetrics objects.
private static void PrintMetrics(BinaryClassificationMetrics metrics)
{
Console.WriteLine($"Accuracy: {metrics.Accuracy:F2}");
Console.WriteLine($"AUC: {metrics.AreaUnderRocCurve:F2}");
Console.WriteLine($"F1 Score: {metrics.F1Score:F2}");
Console.WriteLine($"Negative Precision: {metrics.NegativePrecision:F2}");
Console.WriteLine($"Negative Recall: {metrics.NegativeRecall:F2}");
Console.WriteLine($"Positive Precision: {metrics.PositivePrecision:F2}");
Console.WriteLine($"Positive Recall: {metrics.PositiveRecall:F2}");
}
}
}
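Note (not part of this commit): to score one data point at a time instead of transforming a whole IDataView, the trained model can be wrapped in a prediction engine. A minimal sketch, assuming the mlContext and model variables and the DataPoint/Prediction classes from the sample above:

// Sketch only (assumption): single-row scoring via the standard ML.NET PredictionEngine API.
var engine = mlContext.Model.CreatePredictionEngine<DataPoint, Prediction>(model);
var sample = GenerateRandomDataPoints(1, seed: 42).First();
var single = engine.Predict(sample);
Console.WriteLine($"Label: {sample.Label}, Prediction: {single.PredictedLabel}");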
@@ -0,0 +1,29 @@
<#@ include file="BinaryClassification.ttinclude"#>
<#+
string ClassName = "AveragedPerceptron";
string Trainer = "AveragedPerceptron";
string TrainerOptions = null;
bool IsCalibrated = false;
bool CacheData = false;

string LabelThreshold = "0.5f";
string DataSepValue = "0.1f";
string OptionsInclude = "";
string Comments= "";

string ExpectedOutputPerInstance = @"// Expected output:
// Label: True, Prediction: True
// Label: False, Prediction: False
// Label: True, Prediction: True
// Label: True, Prediction: False
// Label: False, Prediction: False";

string ExpectedOutput = @"// Expected output:
// Accuracy: 0.72
// AUC: 0.79
// F1 Score: 0.68
// Negative Precision: 0.71
// Negative Recall: 0.80
// Positive Precision: 0.74
// Positive Recall: 0.63";
#>
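The .ttinclude above only sets parameters; the shared BinaryClassification.ttinclude template (not among the files shown here) expands them into the C# sample. As a rough, hypothetical illustration of the kind of line such a template might emit for the trainer:

// Hypothetical T4 fragment (assumed; the real BinaryClassification.ttinclude is not shown in this diff):
var pipeline = mlContext.BinaryClassification.Trainers.<#= Trainer #>(<#= TrainerOptions == null ? "" : "options" #>);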
@@ -1,28 +1,29 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;

namespace Samples.Dynamic.Trainers.BinaryClassification
{
public static class AveragedPerceptronWithOptions
{
public static void Example()
{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
// Setting the seed to a fixed number in this example to make outputs deterministic.
var mlContext = new MLContext(seed: 0);

// Create a list of training data points.
var dataPoints = GenerateRandomDataPoints(1000);

// Convert the list of data points to an IDataView object, which is consumable by ML.NET API.
var trainingData = mlContext.Data.LoadFromEnumerable(dataPoints);

// Define trainer options.
var options = new AveragedPerceptronTrainer.Options
{
LossFunction = new SmoothedHingeLoss(),
LearningRate = 0.1f,
LazyUpdate = false,
RecencyGain = 0.1f,
NumberOfIterations = 10
};

// Define the trainer.
var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron(options);

// Train the model.
var model = pipeline.Fit(trainingData);

// Create testing data. Use different random seed to make it different from training data.
var testData = mlContext.Data.LoadFromEnumerable(GenerateRandomDataPoints(500, seed:123));

// Run the model on test data set.
var transformedTestData = model.Transform(testData);

// Convert IDataView object to a list.
var predictions = mlContext.Data.CreateEnumerable<Prediction>(transformedTestData, reuseRowObject: false).ToList();

// Print 5 predictions.
foreach (var p in predictions.Take(5))
Console.WriteLine($"Label: {p.Label}, Prediction: {p.PredictedLabel}");

// Expected output:
// Label: True, Prediction: True
// Label: False, Prediction: False
// Label: True, Prediction: True
// Label: True, Prediction: True
// Label: False, Prediction: False

// Evaluate the overall metrics.
var metrics = mlContext.BinaryClassification.EvaluateNonCalibrated(transformedTestData);
PrintMetrics(metrics);

// Expected output:
// Accuracy: 0.89
// AUC: 0.96
// F1 Score: 0.88
// Negative Precision: 0.87
// Negative Recall: 0.92
// Positive Precision: 0.91
// Positive Recall: 0.85
}

private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed=0)
{
var random = new Random(seed);
float randomFloat() => (float)random.NextDouble();
for (int i = 0; i < count; i++)
{
var label = randomFloat() > 0.5f;
yield return new DataPoint
{
Label = label,
// Create random features that are correlated with the label.
// For data points with false label, the feature values are slightly increased by adding a constant.
Features = Enumerable.Repeat(label, 50).Select(x => x ? randomFloat() : randomFloat() + 0.1f).ToArray()
};
}
}

// Example with label and 50 feature values. A data set is a collection of such examples.
private class DataPoint
{
public bool Label { get; set; }
[VectorType(50)]
public float[] Features { get; set; }
}

// Class used to capture predictions.
private class Prediction
{
// Original label.
public bool Label { get; set; }
// Predicted label from the trainer.
public bool PredictedLabel { get; set; }
}

// Pretty-print BinaryClassificationMetrics objects.
private static void PrintMetrics(BinaryClassificationMetrics metrics)
{
Console.WriteLine($"Accuracy: {metrics.Accuracy:F2}");
Console.WriteLine($"AUC: {metrics.AreaUnderRocCurve:F2}");
Console.WriteLine($"F1 Score: {metrics.F1Score:F2}");
Console.WriteLine($"Negative Precision: {metrics.NegativePrecision:F2}");
Console.WriteLine($"Negative Recall: {metrics.NegativeRecall:F2}");
Console.WriteLine($"Positive Precision: {metrics.PositivePrecision:F2}");
Console.WriteLine($"Positive Recall: {metrics.PositiveRecall:F2}");
}
}
}
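A common follow-up step, not included in this commit, is persisting the trained model. A minimal sketch, assuming the mlContext, model, trainingData, and testData variables from the sample above:

// Sketch only (assumption): save the trained model to disk and load it back before scoring.
mlContext.Model.Save(model, trainingData.Schema, "averagedPerceptronWithOptions.zip");
var loadedModel = mlContext.Model.Load("averagedPerceptronWithOptions.zip", out var loadedSchema);
var reloadedPredictions = loadedModel.Transform(testData);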
@@ -0,0 +1,37 @@
<#@ include file="BinaryClassification.ttinclude"#>
<#+
string ClassName="AveragedPerceptronWithOptions";
string Trainer = "AveragedPerceptron";
bool IsCalibrated = false;

string LabelThreshold = "0.5f";
string DataSepValue = "0.1f";
string OptionsInclude = "using Microsoft.ML.Trainers;";
string Comments= "";
bool CacheData = false;

string TrainerOptions = @"AveragedPerceptronTrainer.Options
{
LossFunction = new SmoothedHingeLoss(),
LearningRate = 0.1f,
LazyUpdate = false,
RecencyGain = 0.1f,
NumberOfIterations = 10
}";

string ExpectedOutputPerInstance= @"// Expected output:
// Label: True, Prediction: True
// Label: False, Prediction: False
// Label: True, Prediction: True
// Label: True, Prediction: True
// Label: False, Prediction: False";

string ExpectedOutput = @"// Expected output:
// Accuracy: 0.89
// AUC: 0.96
// F1 Score: 0.88
// Negative Precision: 0.87
// Negative Recall: 0.92
// Positive Precision: 0.91
// Positive Recall: 0.85";
#>
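For reference, LabelThreshold and DataSepValue appear to correspond to the constants in the generated GenerateRandomDataPoints method; this mapping is an inference from the samples above, not something the template parameters state explicitly:

// From the generated samples: 0.5f is the label threshold, 0.1f is the offset separating the two classes.
var label = randomFloat() > 0.5f;
Features = Enumerable.Repeat(label, 50).Select(x => x ? randomFloat() : randomFloat() + 0.1f).ToArray()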