From cd333c59180c718c3fe13b44daf45bf7360e7844 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Mon, 11 Mar 2019 12:43:29 -0700 Subject: [PATCH] TrainTestSplit should be inside MLContext.Data (#2907) * TrainTestSplit should be inside MLContext.Data * fix md files --- docs/code/MlNetCookBook.md | 2 +- .../experimental/MlNetCookBookStaticApi.md | 2 +- .../Dynamic/LogisticRegression.cs | 2 +- .../AveragedPerceptron.cs | 2 +- .../AveragedPerceptronWithOptions.cs | 2 +- .../Calibrators/FixedPlatt.cs | 2 +- .../Calibrators/Isotonic.cs | 2 +- .../BinaryClassification/Calibrators/Naive.cs | 2 +- .../BinaryClassification/Calibrators/Platt.cs | 2 +- .../Trainers/BinaryClassification/LightGbm.cs | 2 +- .../LightGbmWithOptions.cs | 2 +- ...ochasticDualCoordinateAscentWithOptions.cs | 2 +- .../StochasticGradientDescent.cs | 2 +- .../StochasticGradientDescentNonCalibrated.cs | 2 +- ...GradientDescentNonCalibratedWithOptions.cs | 2 +- .../StochasticGradientDescentWithOptions.cs | 2 +- .../SymbolicStochasticGradientDescent.cs | 2 +- ...licStochasticGradientDescentWithOptions.cs | 2 +- .../MulticlassClassification/LightGbm.cs | 2 +- .../LightGbmWithOptions.cs | 2 +- .../StochasticDualCoordinateAscent.cs | 2 +- ...ochasticDualCoordinateAscentWithOptions.cs | 2 +- .../Dynamic/Trainers/Ranking/LightGbm.cs | 4 +- .../Trainers/Ranking/LightGbmWithOptions.cs | 4 +- .../Dynamic/Trainers/Regression/LightGbm.cs | 2 +- .../Regression/LightGbmWithOptions.cs | 2 +- .../Regression/OrdinaryLeastSquares.cs | 2 +- .../OrdinaryLeastSquaresWithOptions.cs | 2 +- .../StochasticDualCoordinateAscent.cs | 2 +- ...ochasticDualCoordinateAscentWithOptions.cs | 2 +- .../AveragedPerceptronBinaryClassification.cs | 2 +- .../Static/FastTreeBinaryClassification.cs | 2 +- .../Static/LightGBMBinaryClassification.cs | 2 +- .../LightGBMMulticlassWithInMemoryData.cs | 2 +- .../Static/LightGBMRegression.cs | 2 +- .../Static/SDCABinaryClassification.cs | 2 +- .../Static/SDCARegression.cs | 2 +- .../DataLoadSave/DataOperationsCatalog.cs | 103 +++++++++++++++++ src/Microsoft.ML.Data/TrainCatalog.cs | 105 +----------------- .../TrainingStaticExtensions.cs | 2 +- .../Prediction.cs | 2 +- .../Validation.cs | 2 +- .../StaticPipeTests.cs | 6 +- .../Training.cs | 2 +- .../Api/CookbookSamples/CookbookSamples.cs | 2 +- .../CookbookSamplesDynamicApi.cs | 2 +- .../Scenarios/Api/TestApi.cs | 8 +- 47 files changed, 155 insertions(+), 157 deletions(-) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 3e9a2b5769..6621fa8969 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -825,7 +825,7 @@ var pipeline = .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent()); // Split the data 90:10 into train and test sets, train and evaluate. -var split = mlContext.MulticlassClassification.TrainTestSplit(data, testFraction: 0.1); +var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Train the model. var model = pipeline.Fit(split.TrainSet); diff --git a/docs/code/experimental/MlNetCookBookStaticApi.md b/docs/code/experimental/MlNetCookBookStaticApi.md index 0d2ef64f46..fcdb2c45ae 100644 --- a/docs/code/experimental/MlNetCookBookStaticApi.md +++ b/docs/code/experimental/MlNetCookBookStaticApi.md @@ -907,7 +907,7 @@ var learningPipeline = loader.MakeNewEstimator() Predictions: mlContext.MulticlassClassification.Trainers.Sdca(r.Label, r.Features))); // Split the data 90:10 into train and test sets, train and evaluate. -var (trainData, testData) = mlContext.MulticlassClassification.TrainTestSplit(data, testFraction: 0.1); +var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Train the model. var model = learningPipeline.Fit(trainData); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/LogisticRegression.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/LogisticRegression.cs index 866348c1b3..8e53da2143 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/LogisticRegression.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/LogisticRegression.cs @@ -57,7 +57,7 @@ public static void Example() IDataView data = loader.Load(dataFilePath); - var split = ml.BinaryClassification.TrainTestSplit(data, testFraction: 0.2); + var split = ml.Data.TrainTestSplit(data, testFraction: 0.2); var pipeline = ml.Transforms.Concatenate("Text", "workclass", "education", "marital-status", "relationship", "ethnicity", "sex", "native-country") diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptron.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptron.cs index e9702a8e5a..fd1ff9457a 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptron.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptron.cs @@ -16,7 +16,7 @@ public static void Example() var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext); // Leave out 10% of data for testing. - var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1); + var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Create data training pipeline. var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron(numIterations: 10); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptronWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptronWithOptions.cs index fb1dfacf50..6c2693ef8f 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptronWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptronWithOptions.cs @@ -18,7 +18,7 @@ public static void Example() var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext); // Leave out 10% of data for testing. - var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1); + var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Define the trainer options. var options = new AveragedPerceptronTrainer.Options() diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/FixedPlatt.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/FixedPlatt.cs index 9164eaaca0..52fe41cc4b 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/FixedPlatt.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/FixedPlatt.cs @@ -15,7 +15,7 @@ public static void Example() // Download and featurize the dataset. var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext); // Leave out 10% of data for testing. - var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.3); + var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.3); // Create data training pipeline for non calibrated trainer and train Naive calibrator on top of it. var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Isotonic.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Isotonic.cs index 91d1586869..9c856d1455 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Isotonic.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Isotonic.cs @@ -15,7 +15,7 @@ public static void Example() // Download and featurize the dataset. var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext); // Leave out 10% of data for testing. - var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.3); + var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.3); // Create data training pipeline for non calibrated trainer and train Naive calibrator on top of it. var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Naive.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Naive.cs index 9cb46e1677..edb38b5cc5 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Naive.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Naive.cs @@ -15,7 +15,7 @@ public static void Example() // Download and featurize the dataset. var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext); // Leave out 10% of data for testing. - var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.3); + var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.3); // Create data training pipeline for non calibrated trainer and train Naive calibrator on top of it. var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Platt.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Platt.cs index 8f15d0c138..12ff762d14 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Platt.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Platt.cs @@ -15,7 +15,7 @@ public static void Example() // Download and featurize the dataset. var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext); // Leave out 10% of data for testing. - var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.3); + var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.3); // Create data training pipeline for non calibrated trainer and train Naive calibrator on top of it. var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbm.cs index 492802e823..3d2cc59c05 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbm.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbm.cs @@ -12,7 +12,7 @@ public static void Example() var dataview = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext); // Leave out 10% of data for testing. - var split = mlContext.BinaryClassification.TrainTestSplit(dataview, testFraction: 0.1); + var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1); // Create the Estimator. var pipeline = mlContext.BinaryClassification.Trainers.LightGbm(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbmWithOptions.cs index 7b0e21fed9..1323d765cf 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbmWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbmWithOptions.cs @@ -15,7 +15,7 @@ public static void Example() var dataview = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext); // Leave out 10% of data for testing. - var split = mlContext.BinaryClassification.TrainTestSplit(dataview, testFraction: 0.1); + var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1); // Create the pipeline with LightGbm Estimator using advanced options. var pipeline = mlContext.BinaryClassification.Trainers.LightGbm( diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentWithOptions.cs index 97475cf45f..f9242829e3 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentWithOptions.cs @@ -19,7 +19,7 @@ public static void Example() var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext); // Leave out 10% of data for testing. - var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1); + var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Define the trainer options. var options = new SdcaBinaryTrainer.Options() diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescent.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescent.cs index efcf064a2c..8fad192c44 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescent.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescent.cs @@ -16,7 +16,7 @@ public static void Example() var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext); // Leave out 10% of data for testing. - var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1); + var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Create data training pipeline. var pipeline = mlContext.BinaryClassification.Trainers.StochasticGradientDescent(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibrated.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibrated.cs index ef2acb80a4..370c2c37c0 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibrated.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibrated.cs @@ -16,7 +16,7 @@ public static void Example() var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext); // Leave out 10% of data for testing. - var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1); + var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Create data training pipeline. var pipeline = mlContext.BinaryClassification.Trainers.StochasticGradientDescentNonCalibrated(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibratedWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibratedWithOptions.cs index 3c799834d1..6a69f5952f 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibratedWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibratedWithOptions.cs @@ -18,7 +18,7 @@ public static void Example() var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext); // Leave out 10% of data for testing. - var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1); + var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Create data training pipeline. var pipeline = mlContext.BinaryClassification diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentWithOptions.cs index e98aa8af17..b65cc978ff 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentWithOptions.cs @@ -18,7 +18,7 @@ public static void Example() var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext); // Leave out 10% of data for testing. - var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1); + var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Define the trainer options. var options = new SgdBinaryTrainer.Options() diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs index c3f1e1508e..10496e4b25 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs @@ -17,7 +17,7 @@ public static void Example() var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext); // Leave out 10% of data for testing. - var split = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1); + var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Create data training pipeline. var pipeline = mlContext.BinaryClassification.Trainers.SymbolicStochasticGradientDescent(labelColumnName: "IsOver50K", numberOfIterations: 25); var model = pipeline.Fit(split.TrainSet); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs index de4f4ff386..99ea7f1460 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs @@ -17,7 +17,7 @@ public static void Example() var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext); // Leave out 10% of data for testing. - var split = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1); + var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Create data training pipeline var pipeline = mlContext.BinaryClassification.Trainers.SymbolicStochasticGradientDescent( new ML.Trainers.SymbolicStochasticGradientDescentClassificationTrainer.Options() diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbm.cs index 399ddda16f..e7604ee5bf 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbm.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbm.cs @@ -37,7 +37,7 @@ public static void Example() // Split the static-typed data into training and test sets. Only training set is used in fitting // the created pipeline. Metrics are computed on the test. - var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.5); + var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.5); // Train the model. var model = pipeline.Fit(split.TrainSet); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs index 5af98034bf..cebdf2f4bd 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs @@ -48,7 +48,7 @@ public static void Example() // Split the static-typed data into training and test sets. Only training set is used in fitting // the created pipeline. Metrics are computed on the test. - var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.5); + var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.5); // Train the model. var model = pipeline.Fit(split.TrainSet); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscent.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscent.cs index d99d3368d7..b436fe502e 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscent.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscent.cs @@ -34,7 +34,7 @@ public static void Example() // Split the data into training and test sets. Only training set is used in fitting // the created pipeline. Metrics are computed on the test. - var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1); + var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1); // Train the model. var model = pipeline.Fit(split.TrainSet); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscentWithOptions.cs index d803137ce4..7d65bce1db 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscentWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscentWithOptions.cs @@ -45,7 +45,7 @@ public static void Example() // Split the data into training and test sets. Only training set is used in fitting // the created pipeline. Metrics are computed on the test. - var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1); + var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1); // Train the model. var model = pipeline.Fit(split.TrainSet); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs index 475fbe23c2..8330e0e92c 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs @@ -13,8 +13,8 @@ public static void Example() // Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split // respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in - // the test split. The samplingKeyColumn parameter in Ranking.TrainTestSplit is used for this purpose. - var split = mlContext.Ranking.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId"); + // the test split. The samplingKeyColumn parameter in Data.TrainTestSplit is used for this purpose. + var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId"); // Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations. var pipeline = mlContext.Ranking.Trainers.LightGbm( diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs index 101d08ec13..e0bc29a0af 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs @@ -16,8 +16,8 @@ public static void Example() // Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split // respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in - // the test split. The samplingKeyColumn parameter in Ranking.TrainTestSplit is used for this purpose. - var split = mlContext.Ranking.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId"); + // the test split. The samplingKeyColumn parameter in Data.TrainTestSplit is used for this purpose. + var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId"); // Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations. var pipeline = mlContext.Ranking.Trainers.LightGbm( diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs index d23aebf141..816b980c9f 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs @@ -23,7 +23,7 @@ public static void Example() // 21.60 0.02731 00.00 7.070 0 0.4690 6.4210 78.90 ... // 34.70 0.02729 00.00 7.070 0 0.4690 7.1850 61.10 ... - var split = mlContext.Regression.TrainTestSplit(dataView, testFraction: 0.1); + var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1); // Create the estimator, here we only need LightGbm trainer // as data is already processed in a form consumable by the trainer. diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs index 260c546e7f..86c37af30b 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs @@ -25,7 +25,7 @@ public static void Example() // 21.60 0.02731 00.00 7.070 0 0.4690 6.4210 78.90 ... // 34.70 0.02729 00.00 7.070 0 0.4690 7.1850 61.10 ... - var split = mlContext.Regression.TrainTestSplit(dataView, testFraction: 0.1); + var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1); // Create a pipeline with LightGbm estimator with advanced options. // Here we only need LightGbm trainer as data is already processed diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs index 7fe79c9a43..204322cec7 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs @@ -39,7 +39,7 @@ public static void Example() // 21.60 0.02731 00.00 7.070 0 0.4690 6.4210 78.90 // 34.70 0.02729 00.00 7.070 0 0.4690 7.1850 61.10 - var split = mlContext.Regression.TrainTestSplit(dataView, testFraction: 0.2); + var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.2); // Create the estimator, here we only need OrdinaryLeastSquares trainer // as data is already processed in a form consumable by the trainer diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs index cbbd09342e..ccfa66aeb7 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs @@ -40,7 +40,7 @@ public static void Example() // 21.60 0.02731 00.00 7.070 0 0.4690 6.4210 78.90 // 34.70 0.02729 00.00 7.070 0 0.4690 7.1850 61.10 - var split = mlContext.Regression.TrainTestSplit(dataView, testFraction: 0.2); + var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.2); // Create the estimator, here we only need OrdinaryLeastSquares trainer // as data is already processed in a form consumable by the trainer diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscent.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscent.cs index 7ce8122f78..c97c0e7be1 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscent.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscent.cs @@ -19,7 +19,7 @@ public static void Example() // Split the data into training and test sets. Only training set is used in fitting // the created pipeline. Metrics are computed on the test. - var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1); + var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1); // Train the model. var pipeline = mlContext.Regression.Trainers.StochasticDualCoordinateAscent(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscentWithOptions.cs index 5cb29da11c..2f55403e9a 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscentWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscentWithOptions.cs @@ -18,7 +18,7 @@ public static void Example() // Split the data into training and test sets. Only training set is used in fitting // the created pipeline. Metrics are computed on the test. - var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1); + var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1); // Create trainer options. var options = new SdcaRegressionTrainer.Options diff --git a/docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs index 1f471917c2..34b380d086 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs @@ -55,7 +55,7 @@ public static void AveragedPerceptronBinaryClassification() // Load the data, and leave 10% out, so we can use them for testing var data = loader.Load(dataFilePath); - var (trainData, testData) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1); + var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Create the Estimator var learningPipeline = loader.MakeNewEstimator() diff --git a/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs index c9835cd0ff..f34c6c4145 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs @@ -56,7 +56,7 @@ public static void FastTreeBinaryClassification() // Loader the data, and leave 10% out, so we can use them for testing var data = loader.Load(dataFilePath); - var (trainData, testData) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1); + var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Create the Estimator var learningPipeline = loader.MakeNewEstimator() diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs index 90fa57e933..d2be520631 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs @@ -56,7 +56,7 @@ public static void LightGbmBinaryClassification() // Load the data, and leave 10% out, so we can use them for testing var data = loader.Load(dataFilePath); - var (trainData, testData) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1); + var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Create the Estimator var learningPipeline = loader.MakeNewEstimator() diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs index 67ea5ce8be..fe987b434d 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs @@ -52,7 +52,7 @@ public void MultiClassLightGbmStaticPipelineWithInMemoryData() // Split the static-typed data into training and test sets. Only training set is used in fitting // the created pipeline. Metrics are computed on the test. - var (trainingData, testingData) = mlContext.MulticlassClassification.TrainTestSplit(staticDataView, testFraction: 0.5); + var (trainingData, testingData) = mlContext.Data.TrainTestSplit(staticDataView, testFraction: 0.5); // Train the model. var model = pipe.Fit(trainingData); diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs index cab1700636..344bf8bbc3 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs @@ -28,7 +28,7 @@ public static void LightGbmRegression() // Load the data, and leave 10% out, so we can use them for testing var data = loader.Load(new MultiFileSource(dataFile)); - var (trainData, testData) = mlContext.Regression.TrainTestSplit(data, testFraction: 0.1); + var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // The predictor that gets produced out of training LightGbmRegressionModelParameters pred = null; diff --git a/docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs index 8150b6400b..8caf039a3d 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs @@ -55,7 +55,7 @@ public static void SdcaBinaryClassification() // Load the data, and leave 10% out, so we can use them for testing var data = loader.Load(dataFilePath); - var (trainData, testData) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1); + var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Create the Estimator var learningPipeline = loader.MakeNewEstimator() diff --git a/docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs b/docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs index e4c7188d30..ac4e6b2a7a 100644 --- a/docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs +++ b/docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs @@ -25,7 +25,7 @@ public static void SdcaRegression() // Load the data, and leave 10% out, so we can use them for testing var data = loader.Load(dataFile); - var (trainData, testData) = mlContext.Regression.TrainTestSplit(data, testFraction: 0.1); + var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // The predictor that gets produced out of training LinearRegressionModelParameters pred = null; diff --git a/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs index d40ed37a99..f9d60a3429 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs @@ -20,6 +20,31 @@ public sealed class DataOperationsCatalog : IInternalCatalog IHostEnvironment IInternalCatalog.Environment => _env; private readonly IHostEnvironment _env; + /// + /// A pair of datasets, for the train and test set. + /// + public struct TrainTestData + { + /// + /// Training set. + /// + public readonly IDataView TrainSet; + /// + /// Testing set. + /// + public readonly IDataView TestSet; + /// + /// Create pair of datasets. + /// + /// Training set. + /// Testing set. + internal TrainTestData(IDataView trainSet, IDataView testSet) + { + TrainSet = trainSet; + TestSet = testSet; + } + } + internal DataOperationsCatalog(IHostEnvironment env) { Contracts.AssertValue(env); @@ -338,5 +363,83 @@ public IDataView TakeRows(IDataView input, long count) return new SkipTakeFilter(_env, options, input); } + + /// + /// Split the dataset into the train set and test set according to the given fraction. + /// Respects the if provided. + /// + /// The dataset to split. + /// The fraction of data to go into the test set. + /// Name of a column to use for grouping rows. If two examples share the same value of the , + /// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set. + /// If no row grouping will be performed. + /// Seed for the random number generator used to select rows for the train-test split. + public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string samplingKeyColumn = null, uint? seed = null) + { + _env.CheckValue(data, nameof(data)); + _env.CheckParam(0 < testFraction && testFraction < 1, nameof(testFraction), "Must be between 0 and 1 exclusive"); + _env.CheckValueOrNull(samplingKeyColumn); + + EnsureGroupPreservationColumn(_env, ref data, ref samplingKeyColumn, seed); + + var trainFilter = new RangeFilter(_env, new RangeFilter.Options() + { + Column = samplingKeyColumn, + Min = 0, + Max = testFraction, + Complement = true + }, data); + var testFilter = new RangeFilter(_env, new RangeFilter.Options() + { + Column = samplingKeyColumn, + Min = 0, + Max = testFraction, + Complement = false + }, data); + + return new TrainTestData(trainFilter, testFilter); + } + + /// + /// Ensures the provided is valid for , hashing it if necessary, or creates a new column is null. + /// + internal static void EnsureGroupPreservationColumn(IHostEnvironment env, ref IDataView data, ref string samplingKeyColumn, uint? seed = null) + { + // We need to handle two cases: if samplingKeyColumn is provided, we use hashJoin to + // build a single hash of it. If it is not, we generate a random number. + + if (samplingKeyColumn == null) + { + samplingKeyColumn = data.Schema.GetTempColumnName("SamplingKeyColumn"); + data = new GenerateNumberTransform(env, data, samplingKeyColumn, seed); + } + else + { + if (!data.Schema.TryGetColumnIndex(samplingKeyColumn, out int stratCol)) + throw env.ExceptSchemaMismatch(nameof(samplingKeyColumn), "SamplingKeyColumn", samplingKeyColumn); + + var type = data.Schema[stratCol].Type; + if (!RangeFilter.IsValidRangeFilterColumnType(env, type)) + { + // Hash the samplingKeyColumn. + // REVIEW: this could currently crash, since Hash only accepts a limited set + // of column types. It used to be HashJoin, but we should probably extend Hash + // instead of having two hash transformations. + var origStratCol = samplingKeyColumn; + int tmp; + int inc = 0; + + // Generate a new column with the hashed samplingKeyColumn. + while (data.Schema.TryGetColumnIndex(samplingKeyColumn, out tmp)) + samplingKeyColumn = string.Format("{0}_{1:000}", origStratCol, ++inc); + HashingEstimator.ColumnOptions columnOptions; + if (seed.HasValue) + columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30, seed.Value); + else + columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30); + data = new HashingEstimator(env, columnOptions).Fit(data).Transform(data); + } + } + } } } diff --git a/src/Microsoft.ML.Data/TrainCatalog.cs b/src/Microsoft.ML.Data/TrainCatalog.cs index 072b924e6b..7fd623daa5 100644 --- a/src/Microsoft.ML.Data/TrainCatalog.cs +++ b/src/Microsoft.ML.Data/TrainCatalog.cs @@ -24,67 +24,6 @@ public abstract class TrainCatalogBase : IInternalCatalog [BestFriend] private protected IHostEnvironment Environment { get; } - /// - /// A pair of datasets, for the train and test set. - /// - public struct TrainTestData - { - /// - /// Training set. - /// - public readonly IDataView TrainSet; - /// - /// Testing set. - /// - public readonly IDataView TestSet; - /// - /// Create pair of datasets. - /// - /// Training set. - /// Testing set. - internal TrainTestData(IDataView trainSet, IDataView testSet) - { - TrainSet = trainSet; - TestSet = testSet; - } - } - - /// - /// Split the dataset into the train set and test set according to the given fraction. - /// Respects the if provided. - /// - /// The dataset to split. - /// The fraction of data to go into the test set. - /// Name of a column to use for grouping rows. If two examples share the same value of the , - /// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set. - /// If no row grouping will be performed. - /// Seed for the random number generator used to select rows for the train-test split. - public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string samplingKeyColumn = null, uint? seed = null) - { - Environment.CheckValue(data, nameof(data)); - Environment.CheckParam(0 < testFraction && testFraction < 1, nameof(testFraction), "Must be between 0 and 1 exclusive"); - Environment.CheckValueOrNull(samplingKeyColumn); - - EnsureGroupPreservationColumn(ref data, ref samplingKeyColumn, seed); - - var trainFilter = new RangeFilter(Environment, new RangeFilter.Options() - { - Column = samplingKeyColumn, - Min = 0, - Max = testFraction, - Complement = true - }, data); - var testFilter = new RangeFilter(Environment, new RangeFilter.Options() - { - Column = samplingKeyColumn, - Min = 0, - Max = testFraction, - Complement = false - }, data); - - return new TrainTestData(trainFilter, testFilter); - } - /// /// Results for specific cross-validation fold. /// @@ -156,7 +95,7 @@ private protected CrossValidationResult[] CrossValidateTrain(IDataView data, IEs Environment.CheckParam(numFolds > 1, nameof(numFolds), "Must be more than 1"); Environment.CheckValueOrNull(samplingKeyColumn); - EnsureGroupPreservationColumn(ref data, ref samplingKeyColumn, seed); + DataOperationsCatalog.EnsureGroupPreservationColumn(Environment, ref data, ref samplingKeyColumn, seed); Func foldFunction = fold => @@ -199,48 +138,6 @@ private protected TrainCatalogBase(IHostEnvironment env, string registrationName Environment = env; } - /// - /// Ensures the provided is valid for , hashing it if necessary, or creates a new column is null. - /// - private void EnsureGroupPreservationColumn(ref IDataView data, ref string samplingKeyColumn, uint? seed = null) - { - // We need to handle two cases: if samplingKeyColumn is provided, we use hashJoin to - // build a single hash of it. If it is not, we generate a random number. - - if (samplingKeyColumn == null) - { - samplingKeyColumn = data.Schema.GetTempColumnName("SamplingKeyColumn"); - data = new GenerateNumberTransform(Environment, data, samplingKeyColumn, seed); - } - else - { - if (!data.Schema.TryGetColumnIndex(samplingKeyColumn, out int stratCol)) - throw Environment.ExceptSchemaMismatch(nameof(samplingKeyColumn), "SamplingKeyColumn", samplingKeyColumn); - - var type = data.Schema[stratCol].Type; - if (!RangeFilter.IsValidRangeFilterColumnType(Environment, type)) - { - // Hash the samplingKeyColumn. - // REVIEW: this could currently crash, since Hash only accepts a limited set - // of column types. It used to be HashJoin, but we should probably extend Hash - // instead of having two hash transformations. - var origStratCol = samplingKeyColumn; - int tmp; - int inc = 0; - - // Generate a new column with the hashed samplingKeyColumn. - while (data.Schema.TryGetColumnIndex(samplingKeyColumn, out tmp)) - samplingKeyColumn = string.Format("{0}_{1:000}", origStratCol, ++inc); - HashingEstimator.ColumnOptions columnOptions; - if (seed.HasValue) - columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30, seed.Value); - else - columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30); - data = new HashingEstimator(Environment, columnOptions).Fit(data).Transform(data); - } - } - } - /// /// Subclasses of will provide little "extension method" hookable objects /// (for example, something like ). User code will only diff --git a/src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs index bd2ccad153..2713e96f1c 100644 --- a/src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs @@ -30,7 +30,7 @@ public static class TrainingStaticExtensions /// If the is not provided, the random numbers generated to create it, will use this seed as value. /// And if it is not provided, the default value will be used. /// A pair of datasets, for the train and test set. - public static (DataView trainSet, DataView testSet) TrainTestSplit(this TrainCatalogBase catalog, + public static (DataView trainSet, DataView testSet) TrainTestSplit(this DataOperationsCatalog catalog, DataView data, double testFraction = 0.1, Func stratificationColumn = null, uint? seed = null) { var env = StaticPipeUtils.GetEnvironment(data); diff --git a/test/Microsoft.ML.Functional.Tests/Prediction.cs b/test/Microsoft.ML.Functional.Tests/Prediction.cs index 9bad405571..b0d1fee62a 100644 --- a/test/Microsoft.ML.Functional.Tests/Prediction.cs +++ b/test/Microsoft.ML.Functional.Tests/Prediction.cs @@ -25,7 +25,7 @@ public void ReconfigurablePrediction() var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) .Load(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename)); - var split = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.2); + var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.2); // Create a pipeline to train on the housing data var pipeline = mlContext.Transforms.Concatenate("Features", new string[] { diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs index 49a5db6693..86157e2626 100644 --- a/test/Microsoft.ML.Functional.Tests/Validation.cs +++ b/test/Microsoft.ML.Functional.Tests/Validation.cs @@ -65,7 +65,7 @@ public void TrainWithValidationSet() var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); // Create the train and validation set. - var dataSplit = mlContext.Regression.TrainTestSplit(data, testFraction: 0.2); + var dataSplit = mlContext.Data.TrainTestSplit(data, testFraction: 0.2); var trainData = dataSplit.TrainSet; var validData = dataSplit.TestSet; diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index 813ec4c352..629eec9486 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -722,13 +722,11 @@ public void TrainTestSplit() var dataPath = GetDataPath(TestDatasets.iris.trainFilename); var dataSource = new MultiFileSource(dataPath); - var ctx = new BinaryClassificationCatalog(env); - var reader = TextLoaderStatic.CreateLoader(env, c => (label: c.LoadFloat(0), features: c.LoadFloat(1, 4))); var data = reader.Load(dataSource); - var (train, test) = ctx.TrainTestSplit(data, 0.5); + var (train, test) = env.Data.TrainTestSplit(data, 0.5); // Just make sure that the train is about the same size as the test set. var trainCount = train.GetColumn(r => r.label).Count(); @@ -737,7 +735,7 @@ public void TrainTestSplit() Assert.InRange(trainCount * 1.0 / testCount, 0.8, 1.2); // Now stratify by label. Silly thing to do. - (train, test) = ctx.TrainTestSplit(data, 0.5, stratificationColumn: r => r.label); + (train, test) = env.Data.TrainTestSplit(data, 0.5, stratificationColumn: r => r.label); var trainLabels = train.GetColumn(r => r.label).Distinct(); var testLabels = test.GetColumn(r => r.label).Distinct(); Assert.True(trainLabels.Count() > 0); diff --git a/test/Microsoft.ML.StaticPipelineTesting/Training.cs b/test/Microsoft.ML.StaticPipelineTesting/Training.cs index 130b4cfba3..11e76586f0 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/Training.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/Training.cs @@ -1232,7 +1232,7 @@ public void MultiClassLightGbmStaticPipelineWithInMemoryData() // Split the static-typed data into training and test sets. Only training set is used in fitting // the created pipeline. Metrics are computed on the test. - var (trainingData, testingData) = mlContext.MulticlassClassification.TrainTestSplit(staticDataView, testFraction: 0.5); + var (trainingData, testingData) = mlContext.Data.TrainTestSplit(staticDataView, testFraction: 0.5); // Train the model. var model = pipe.Fit(trainingData); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs index 8e7e97145c..03a36249cb 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs @@ -601,7 +601,7 @@ private void CrossValidationOn(string dataPath) Predictions: mlContext.MulticlassClassification.Trainers.Sdca(r.Label, r.Features))); // Split the data 90:10 into train and test sets, train and evaluate. - var (trainData, testData) = mlContext.MulticlassClassification.TrainTestSplit(data, testFraction: 0.1); + var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Train the model. var model = pipeline.Fit(trainData); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index 77c6145849..bf7ab04d51 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -426,7 +426,7 @@ private void CrossValidationOn(string dataPath) .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent()); // Split the data 90:10 into train and test sets, train and evaluate. - var split = mlContext.MulticlassClassification.TrainTestSplit(data, testFraction: 0.1); + var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.1); // Train the model. var model = pipeline.Fit(split.TrainSet); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/TestApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/TestApi.cs index a1fb6691e0..574db0f901 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/TestApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/TestApi.cs @@ -313,8 +313,8 @@ public void TestTrainTestSplit() // Let's test what train test properly works with seed. // In order to do that, let's split same dataset, but in one case we will use default seed value, // and in other case we set seed to be specific value. - var simpleSplit = mlContext.BinaryClassification.TrainTestSplit(input); - var splitWithSeed = mlContext.BinaryClassification.TrainTestSplit(input, seed: 10); + var simpleSplit = mlContext.Data.TrainTestSplit(input); + var splitWithSeed = mlContext.Data.TrainTestSplit(input, seed: 10); // Since test fraction is 0.1, it's much faster to compare test subsets of split. var simpleTestWorkClass = getWorkclass(simpleSplit.TestSet); @@ -326,7 +326,7 @@ public void TestTrainTestSplit() // Now let's do same thing but with presence of stratificationColumn. // Rows with same values in this stratificationColumn should end up in same subset (train or test). // So let's break dataset by "Workclass" column. - var stratSplit = mlContext.BinaryClassification.TrainTestSplit(input, samplingKeyColumn: "Workclass"); + var stratSplit = mlContext.Data.TrainTestSplit(input, samplingKeyColumn: "Workclass"); var stratTrainWorkclass = getWorkclass(stratSplit.TrainSet); var stratTestWorkClass = getWorkclass(stratSplit.TestSet); // Let's get unique values for "Workclass" column from train subset. @@ -338,7 +338,7 @@ public void TestTrainTestSplit() // Let's do same thing, but this time we will choose different seed. // Stratification column should still break dataset properly without same values in both subsets. - var stratSeed = mlContext.BinaryClassification.TrainTestSplit(input, samplingKeyColumn:"Workclass", seed: 1000000); + var stratSeed = mlContext.Data.TrainTestSplit(input, samplingKeyColumn:"Workclass", seed: 1000000); var stratTrainWithSeedWorkclass = getWorkclass(stratSeed.TrainSet); var stratTestWithSeedWorkClass = getWorkclass(stratSeed.TestSet); // Let's get unique values for "Workclass" column from train subset.