From 348a98de7d50442868a286740c6910391cc48cd9 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Thu, 14 Mar 2019 15:05:19 -0700 Subject: [PATCH 1/2] Checkign in the samples generated during bug bash for MissingNa, ReplaceNA and OneHot --- .../Transforms/Categorical/OneHotEncoding.cs | 85 +++++++++++++++ .../Transforms/IndicateMissingValues.cs | 75 +++++++++++++ .../Transforms/ReplaceMissingValues.cs | 102 ++++++++++++++++++ docs/samples/Microsoft.ML.Samples/Program.cs | 2 +- .../CategoricalCatalog.cs | 6 ++ .../ExtensionsCatalog.cs | 12 +++ 6 files changed, 281 insertions(+), 1 deletion(-) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs new file mode 100644 index 0000000000..43c5e56d36 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs @@ -0,0 +1,85 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; +using static Microsoft.ML.Transforms.OneHotEncodingEstimator; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class OneHotEncoding + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var ml = new MLContext(); + + // Get a small dataset as an IEnumerable. 
+ var samples = new List() + { + new DataPoint(){ Label = 0, Education = "0-5yrs" }, + new DataPoint(){ Label = 1, Education = "0-5yrs" }, + new DataPoint(){ Label = 45, Education = "6-11yrs" }, + new DataPoint(){ Label = 50, Education = "6-11yrs" }, + new DataPoint(){ Label = 50, Education = "11-15yrs" }, + }; + + // Convert training data to IDataView. + var trainData = ml.Data.LoadFromEnumerable(samples); + + // A pipeline for one hot encoding the Education column. + var bagPipeline = ml.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Bag); + // Fit to data. + var bagTransformer = bagPipeline.Fit(trainData); + + // Get transformed data + var bagTransformedData = bagTransformer.Transform(trainData); + // Getting the data of the newly created column, so we can preview it. + var bagEncodedColumn = bagTransformedData.GetColumn("EducationOneHotEncoded"); + + var keyPipeline = ml.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Key); + // Fit to data. + var keyTransformer = keyPipeline.Fit(trainData); + + // Get transformed data + var keyTransformedData = keyTransformer.Transform(trainData); + // Getting the data of the newly created column, so we can preview it. + var keyEncodedColumn = keyTransformedData.GetColumn("EducationOneHotEncoded"); + + Console.WriteLine("One Hot Encoding based on the bagging strategy."); + foreach (var row in bagEncodedColumn) + { + for (var i = 0; i < row.Length; i++) + Console.Write($"{row[i]} "); + } + + // data column obtained post-transformation. + // Since there are three distinct categories in the Education column of the trainData, the output vector + // for one hot will have three slots.
+ // + // 1 0 0 + // 1 0 0 + // 0 1 0 + // 0 1 0 + // 0 0 1 + + Console.WriteLine("One Hot Encoding with key type output."); + foreach (var element in keyEncodedColumn) + Console.WriteLine(element); + + // 1 + // 1 + // 2 + // 2 + // 3 + + } + + private class DataPoint + { + public float Label { get; set; } + + public string Education { get; set; } + } + } +} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs new file mode 100644 index 0000000000..1e7165363a --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs @@ -0,0 +1,75 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class IndicateMissingValues + { + + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + var samples = new List() + { + new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} }, + new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} }, + new DataPoint(){ Label = float.NaN, Features = new float[3] {-1, float.NaN, -3} }, + }; + // Convert training data to IDataView, the general data type used in ML.NET. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // IndicateMissingValues is used to create a boolean containing + // 'true' where the value in the input column is NaN. This value can be used + // to replace missing values with other values. + + IEstimator pipeline = mlContext.Transforms.IndicateMissingValues("MissingIndicator", "Features"); + + // Now we can transform the data and look at the output to confirm the behavior of the estimator.
+ // This operation doesn't actually evaluate data until we read the data below. + var tansformer = pipeline.Fit(data); + var transformedData = tansformer.Transform(data); + + // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. + var rowEnumerable = mlContext.Data.CreateEnumerable(transformedData, reuseRowObject: false); + + // a small printing utility + Func vectorPrinter = (object[] vector) => + { + string preview = "["; + foreach (var slot in vector) + preview += $"{slot} "; + return preview += "]"; + + }; + + // And finally, we can write out the rows of the dataset, looking at the columns of interest. + foreach (var row in rowEnumerable) + { + Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast().ToArray())} MissingIndicator: {vectorPrinter(row.MissingIndicator.Cast().ToArray())}"); + } + + // Expected output: + // + // Label: 3 Features: [1 1 0] MissingIndicator: [False False False] + // Label: 32 Features: [0 NaN 1] MissingIndicator: [False True False] + // Label: NaN Features: [-1 NaN -3 ] MissingIndicator: [False True False] + } + + private class DataPoint + { + public float Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + private sealed class SampleDataTransformed : DataPoint + { + public bool[] MissingIndicator { get; set; } + } + } +} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs new file mode 100644 index 0000000000..1bcc4ef5f5 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs @@ -0,0 +1,102 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; +using static Microsoft.ML.Transforms.MissingValueReplacingEstimator.ColumnOptions; + +namespace Microsoft.ML.Samples.Dynamic +{ + class 
ReplaceMissingValues + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + var samples = new List() + { + new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} }, + new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} }, + new DataPoint(){ Label = 5, Features = new float[3] {-1, 2, -3} }, + new DataPoint(){ Label = 9, Features = new float[3] {-1, 6, -3} }, + }; + // Convert training data to IDataView, the general data type used in ML.NET. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode. + var meanPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", ReplacementMode.Mean); + + // Now we can transform the data and look at the output to confirm the behavior of the estimator. + // This operation doesn't actually evaluate data until we read the data below. + var meanTransformer = meanPipeline.Fit(data); + var meanTransformedData = meanTransformer.Transform(data); + + // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. + var meanRowEnumerable = mlContext.Data.CreateEnumerable(meanTransformedData, reuseRowObject: false); + + // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode. + var defaultPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", ReplacementMode.DefaultValue); + + // Now we can transform the data and look at the output to confirm the behavior of the estimator. + // This operation doesn't actually evaluate data until we read the data below. 
+ var defaultTransformer = defaultPipeline.Fit(data); + var defaultTransformedData = defaultTransformer.Transform(data); + + // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below. + var defaultRowEnumerable = mlContext.Data.CreateEnumerable(defaultTransformedData, reuseRowObject: false); + + // a small printing utility + Func vectorPrinter = (object[] vector) => + { + string preview = "["; + foreach (var slot in vector) + preview += $"{slot} "; + return preview += "]"; + + }; + + // And finally, we can write out the rows of the dataset, looking at the columns of interest. + foreach (var row in meanRowEnumerable) + { + Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast().ToArray())} MissingReplaced: {vectorPrinter(row.MissingReplaced.Cast().ToArray())}"); + } + + // Expected output: + // Notice how the NaN of the Features column for the second row is replaced by 3, the mean of (1, 2, 6), the values found in that same slot in the other rows. + // + //Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0] + //Label: 32 Features: [0 NaN 1] MissingReplaced: [0 3 1] + //Label: 5 Features: [-1 2 -3] MissingReplaced: [-1 2 -3] + //Label: 9 Features: [-1 6 -3] MissingReplaced: [-1 6 -3] + + // And finally, we can write out the rows of the dataset, looking at the columns of interest. + foreach (var row in defaultRowEnumerable) + { + Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast().ToArray())} MissingReplaced: {vectorPrinter(row.MissingReplaced.Cast().ToArray())}"); + } + + // Expected output: + // Notice how the NaN of the Features column for the second row is replaced by 0, the default value for floats.
+ // + //Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0] + //Label: 32 Features: [0 NaN 1] MissingReplaced: [0 0 1] + //Label: 5 Features: [-1 2 -3] MissingReplaced: [-1 2 -3] + //Label: 9 Features: [-1 6 -3] MissingReplaced: [-1 6 -3] + } + + private class DataPoint + { + public float Label { get; set; } + + [VectorType(3)] + public float[] Features { get; set; } + } + + private sealed class SampleDataTransformed : DataPoint + { + [VectorType(3)] + public float[] MissingReplaced { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index 00424b9516..ef67739045 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -6,7 +6,7 @@ internal static class Program { static void Main(string[] args) { - CustomMapping.Example(); + ReplaceMissingValues.Example(); } } } diff --git a/src/Microsoft.ML.Transforms/CategoricalCatalog.cs b/src/Microsoft.ML.Transforms/CategoricalCatalog.cs index 6d17d4ed78..59c5553d2a 100644 --- a/src/Microsoft.ML.Transforms/CategoricalCatalog.cs +++ b/src/Microsoft.ML.Transforms/CategoricalCatalog.cs @@ -20,6 +20,12 @@ public static class CategoricalCatalog /// Name of the column resulting from the transformation of . /// Name of column to transform. If set to , the value of the will be used as source. /// The conversion mode.
+ /// + /// + /// + /// public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog, string outputColumnName, string inputColumnName = null, diff --git a/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs b/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs index 56b1739034..2e283ad89f 100644 --- a/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs +++ b/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs @@ -29,6 +29,12 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor /// Name of the column resulting from the transformation of . /// Name of column to transform. If set to , the value of the will be used as source. /// If left to null the will get replaced. + /// + /// + /// + /// public static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog, string outputColumnName, string inputColumnName = null) @@ -46,6 +52,12 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor /// Name of column to transform. If set to , the value of the will be used as source. /// If not provided, the will be replaced with the results of the transforms. 
/// The type of replacement to use as specified in + /// + /// + /// + /// public static MissingValueReplacingEstimator ReplaceMissingValues(this TransformsCatalog catalog, string outputColumnName, string inputColumnName = null, From df383d69ba4746b991e49b9f43e1db452960df50 Mon Sep 17 00:00:00 2001 From: Senja Filipi Date: Thu, 14 Mar 2019 16:11:27 -0700 Subject: [PATCH 2/2] PR comments --- .../Dynamic/Transforms/Categorical/OneHotEncoding.cs | 8 ++++---- .../Dynamic/Transforms/IndicateMissingValues.cs | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs index 43c5e56d36..5fd246bd5a 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs @@ -12,7 +12,7 @@ public static void Example() { // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, // as well as the source of randomness. - var ml = new MLContext(); + var mlContext = new MLContext(); // Get a small dataset as an IEnumerable. var samples = new List() @@ -25,10 +25,10 @@ public static void Example() }; // Convert training data to IDataView. - var trainData = ml.Data.LoadFromEnumerable(samples); + var trainData = mlContext.Data.LoadFromEnumerable(samples); // A pipeline for one hot encoding the Education column. - var bagPipeline = ml.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Bag); + var bagPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Bag); // Fit to data. var bagTransformer = bagPipeline.Fit(trainData); @@ -37,7 +37,7 @@ public static void Example() // Getting the data of the newly created column, so we can preview it. 
var bagEncodedColumn = bagTransformedData.GetColumn("EducationOneHotEncoded"); - var keyPipeline = ml.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Key); + var keyPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Key); // Fit to data. var keyTransformer = keyPipeline.Fit(trainData); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs index 1e7165363a..15d448deee 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs @@ -26,7 +26,6 @@ public static void Example() // IndicateMissingValues is used to create a boolean containing // 'true' where the value in the input column is NaN. This value can be used // to replace missing values with other values. - IEstimator pipeline = mlContext.Transforms.IndicateMissingValues("MissingIndicator", "Features"); // Now we can transform the data and look at the output to confirm the behavior of the estimator.