From cd333c59180c718c3fe13b44daf45bf7360e7844 Mon Sep 17 00:00:00 2001
From: Gani Nazirov <ganinz@hotmail.com>
Date: Mon, 11 Mar 2019 12:43:29 -0700
Subject: [PATCH] TrainTestSplit should be inside MLContext.Data (#2907)

* TrainTestSplit should be inside MLContext.Data

* Fix the Markdown cookbook docs to call mlContext.Data.TrainTestSplit
---
 docs/code/MlNetCookBook.md                    |   2 +-
 .../experimental/MlNetCookBookStaticApi.md    |   2 +-
 .../Dynamic/LogisticRegression.cs             |   2 +-
 .../AveragedPerceptron.cs                     |   2 +-
 .../AveragedPerceptronWithOptions.cs          |   2 +-
 .../Calibrators/FixedPlatt.cs                 |   2 +-
 .../Calibrators/Isotonic.cs                   |   2 +-
 .../BinaryClassification/Calibrators/Naive.cs |   2 +-
 .../BinaryClassification/Calibrators/Platt.cs |   2 +-
 .../Trainers/BinaryClassification/LightGbm.cs |   2 +-
 .../LightGbmWithOptions.cs                    |   2 +-
 ...ochasticDualCoordinateAscentWithOptions.cs |   2 +-
 .../StochasticGradientDescent.cs              |   2 +-
 .../StochasticGradientDescentNonCalibrated.cs |   2 +-
 ...GradientDescentNonCalibratedWithOptions.cs |   2 +-
 .../StochasticGradientDescentWithOptions.cs   |   2 +-
 .../SymbolicStochasticGradientDescent.cs      |   2 +-
 ...licStochasticGradientDescentWithOptions.cs |   2 +-
 .../MulticlassClassification/LightGbm.cs      |   2 +-
 .../LightGbmWithOptions.cs                    |   2 +-
 .../StochasticDualCoordinateAscent.cs         |   2 +-
 ...ochasticDualCoordinateAscentWithOptions.cs |   2 +-
 .../Dynamic/Trainers/Ranking/LightGbm.cs      |   4 +-
 .../Trainers/Ranking/LightGbmWithOptions.cs   |   4 +-
 .../Dynamic/Trainers/Regression/LightGbm.cs   |   2 +-
 .../Regression/LightGbmWithOptions.cs         |   2 +-
 .../Regression/OrdinaryLeastSquares.cs        |   2 +-
 .../OrdinaryLeastSquaresWithOptions.cs        |   2 +-
 .../StochasticDualCoordinateAscent.cs         |   2 +-
 ...ochasticDualCoordinateAscentWithOptions.cs |   2 +-
 .../AveragedPerceptronBinaryClassification.cs |   2 +-
 .../Static/FastTreeBinaryClassification.cs    |   2 +-
 .../Static/LightGBMBinaryClassification.cs    |   2 +-
 .../LightGBMMulticlassWithInMemoryData.cs     |   2 +-
 .../Static/LightGBMRegression.cs              |   2 +-
 .../Static/SDCABinaryClassification.cs        |   2 +-
 .../Static/SDCARegression.cs                  |   2 +-
 .../DataLoadSave/DataOperationsCatalog.cs     | 103 +++++++++++++++++
 src/Microsoft.ML.Data/TrainCatalog.cs         | 105 +-----------------
 .../TrainingStaticExtensions.cs               |   2 +-
 .../Prediction.cs                             |   2 +-
 .../Validation.cs                             |   2 +-
 .../StaticPipeTests.cs                        |   6 +-
 .../Training.cs                               |   2 +-
 .../Api/CookbookSamples/CookbookSamples.cs    |   2 +-
 .../CookbookSamplesDynamicApi.cs              |   2 +-
 .../Scenarios/Api/TestApi.cs                  |   8 +-
 47 files changed, 155 insertions(+), 157 deletions(-)

diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md
index 3e9a2b5769..6621fa8969 100644
--- a/docs/code/MlNetCookBook.md
+++ b/docs/code/MlNetCookBook.md
@@ -825,7 +825,7 @@ var pipeline =
     .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent());
 
 // Split the data 90:10 into train and test sets, train and evaluate.
-var split = mlContext.MulticlassClassification.TrainTestSplit(data, testFraction: 0.1);
+var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
 // Train the model.
 var model = pipeline.Fit(split.TrainSet);
diff --git a/docs/code/experimental/MlNetCookBookStaticApi.md b/docs/code/experimental/MlNetCookBookStaticApi.md
index 0d2ef64f46..fcdb2c45ae 100644
--- a/docs/code/experimental/MlNetCookBookStaticApi.md
+++ b/docs/code/experimental/MlNetCookBookStaticApi.md
@@ -907,7 +907,7 @@ var learningPipeline = loader.MakeNewEstimator()
         Predictions: mlContext.MulticlassClassification.Trainers.Sdca(r.Label, r.Features)));
 
 // Split the data 90:10 into train and test sets, train and evaluate.
-var (trainData, testData) = mlContext.MulticlassClassification.TrainTestSplit(data, testFraction: 0.1);
+var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
 // Train the model.
 var model = learningPipeline.Fit(trainData);
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/LogisticRegression.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/LogisticRegression.cs
index 866348c1b3..8e53da2143 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/LogisticRegression.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/LogisticRegression.cs
@@ -57,7 +57,7 @@ public static void Example()
 
             IDataView data = loader.Load(dataFilePath);
 
-            var split = ml.BinaryClassification.TrainTestSplit(data, testFraction: 0.2);
+            var split = ml.Data.TrainTestSplit(data, testFraction: 0.2);
 
             var pipeline = ml.Transforms.Concatenate("Text", "workclass", "education", "marital-status",
                     "relationship", "ethnicity", "sex", "native-country")
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptron.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptron.cs
index e9702a8e5a..fd1ff9457a 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptron.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptron.cs
@@ -16,7 +16,7 @@ public static void Example()
             var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
 
             // Leave out 10% of data for testing.
-            var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+            var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
             // Create data training pipeline.
             var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron(numIterations: 10);
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptronWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptronWithOptions.cs
index fb1dfacf50..6c2693ef8f 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptronWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/AveragedPerceptronWithOptions.cs
@@ -18,7 +18,7 @@ public static void Example()
             var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
 
             // Leave out 10% of data for testing.
-            var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+            var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
             // Define the trainer options.
             var options = new AveragedPerceptronTrainer.Options()
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/FixedPlatt.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/FixedPlatt.cs
index 9164eaaca0..52fe41cc4b 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/FixedPlatt.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/FixedPlatt.cs
@@ -15,7 +15,7 @@ public static void Example()
             // Download and featurize the dataset.
             var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
             // Leave out 10% of data for testing.
-            var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.3);
+            var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.3);
 
             // Create data training pipeline for non calibrated trainer and train Naive calibrator on top of it.
             var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Isotonic.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Isotonic.cs
index 91d1586869..9c856d1455 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Isotonic.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Isotonic.cs
@@ -15,7 +15,7 @@ public static void Example()
             // Download and featurize the dataset.
             var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
             // Leave out 10% of data for testing.
-            var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.3);
+            var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.3);
 
             // Create data training pipeline for non calibrated trainer and train Naive calibrator on top of it.
             var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Naive.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Naive.cs
index 9cb46e1677..edb38b5cc5 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Naive.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Naive.cs
@@ -15,7 +15,7 @@ public static void Example()
             // Download and featurize the dataset.
             var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
             // Leave out 10% of data for testing.
-            var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.3);
+            var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.3);
 
             // Create data training pipeline for non calibrated trainer and train Naive calibrator on top of it.
             var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Platt.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Platt.cs
index 8f15d0c138..12ff762d14 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Platt.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/Calibrators/Platt.cs
@@ -15,7 +15,7 @@ public static void Example()
             // Download and featurize the dataset.
             var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
             // Leave out 10% of data for testing.
-            var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.3);
+            var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.3);
 
             // Create data training pipeline for non calibrated trainer and train Naive calibrator on top of it.
             var pipeline = mlContext.BinaryClassification.Trainers.AveragedPerceptron();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbm.cs
index 492802e823..3d2cc59c05 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbm.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbm.cs
@@ -12,7 +12,7 @@ public static void Example()
             var dataview = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
 
             // Leave out 10% of data for testing.
-            var split = mlContext.BinaryClassification.TrainTestSplit(dataview, testFraction: 0.1);
+            var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1);
 
             // Create the Estimator.
             var pipeline = mlContext.BinaryClassification.Trainers.LightGbm();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbmWithOptions.cs
index 7b0e21fed9..1323d765cf 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbmWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/LightGbmWithOptions.cs
@@ -15,7 +15,7 @@ public static void Example()
             var dataview = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
 
             // Leave out 10% of data for testing.
-            var split = mlContext.BinaryClassification.TrainTestSplit(dataview, testFraction: 0.1);
+            var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1);
 
             // Create the pipeline with LightGbm Estimator using advanced options.
             var pipeline = mlContext.BinaryClassification.Trainers.LightGbm(
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentWithOptions.cs
index 97475cf45f..f9242829e3 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentWithOptions.cs
@@ -19,7 +19,7 @@ public static void Example()
             var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
 
             // Leave out 10% of data for testing.
-            var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+            var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
             // Define the trainer options.
             var options = new SdcaBinaryTrainer.Options()
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescent.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescent.cs
index efcf064a2c..8fad192c44 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescent.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescent.cs
@@ -16,7 +16,7 @@ public static void Example()
             var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
 
             // Leave out 10% of data for testing.
-            var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+            var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
             // Create data training pipeline.
             var pipeline = mlContext.BinaryClassification.Trainers.StochasticGradientDescent();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibrated.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibrated.cs
index ef2acb80a4..370c2c37c0 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibrated.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibrated.cs
@@ -16,7 +16,7 @@ public static void Example()
             var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
 
             // Leave out 10% of data for testing.
-            var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+            var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
             // Create data training pipeline.
             var pipeline = mlContext.BinaryClassification.Trainers.StochasticGradientDescentNonCalibrated();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibratedWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibratedWithOptions.cs
index 3c799834d1..6a69f5952f 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibratedWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentNonCalibratedWithOptions.cs
@@ -18,7 +18,7 @@ public static void Example()
             var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
 
             // Leave out 10% of data for testing.
-            var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+            var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
             // Create data training pipeline.
             var pipeline = mlContext.BinaryClassification
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentWithOptions.cs
index e98aa8af17..b65cc978ff 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticGradientDescentWithOptions.cs
@@ -18,7 +18,7 @@ public static void Example()
             var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
 
             // Leave out 10% of data for testing.
-            var trainTestData = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+            var trainTestData = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
             // Define the trainer options.
             var options = new SgdBinaryTrainer.Options()
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs
index c3f1e1508e..10496e4b25 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescent.cs
@@ -17,7 +17,7 @@ public static void Example()
             var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
 
             // Leave out 10% of data for testing.
-            var split = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+            var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
             // Create data training pipeline.
             var pipeline = mlContext.BinaryClassification.Trainers.SymbolicStochasticGradientDescent(labelColumnName: "IsOver50K", numberOfIterations: 25);
             var model = pipeline.Fit(split.TrainSet);
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs
index de4f4ff386..99ea7f1460 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/SymbolicStochasticGradientDescentWithOptions.cs
@@ -17,7 +17,7 @@ public static void Example()
             var data = SamplesUtils.DatasetUtils.LoadFeaturizedAdultDataset(mlContext);
 
             // Leave out 10% of data for testing.
-            var split = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+            var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
             // Create data training pipeline
             var pipeline = mlContext.BinaryClassification.Trainers.SymbolicStochasticGradientDescent(
                     new ML.Trainers.SymbolicStochasticGradientDescentClassificationTrainer.Options()
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbm.cs
index 399ddda16f..e7604ee5bf 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbm.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbm.cs
@@ -37,7 +37,7 @@ public static void Example()
 
             // Split the static-typed data into training and test sets. Only training set is used in fitting
             // the created pipeline. Metrics are computed on the test.
-            var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.5);
+            var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.5);
 
             // Train the model.
             var model = pipeline.Fit(split.TrainSet);
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs
index 5af98034bf..cebdf2f4bd 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/LightGbmWithOptions.cs
@@ -48,7 +48,7 @@ public static void Example()
 
             // Split the static-typed data into training and test sets. Only training set is used in fitting
             // the created pipeline. Metrics are computed on the test.
-            var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.5);
+            var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.5);
 
             // Train the model.
             var model = pipeline.Fit(split.TrainSet);
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscent.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscent.cs
index d99d3368d7..b436fe502e 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscent.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscent.cs
@@ -34,7 +34,7 @@ public static void Example()
 
             // Split the data into training and test sets. Only training set is used in fitting
             // the created pipeline. Metrics are computed on the test.
-            var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1);
+            var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1);
 
             // Train the model.
             var model = pipeline.Fit(split.TrainSet);
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscentWithOptions.cs
index d803137ce4..7d65bce1db 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscentWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/StochasticDualCoordinateAscentWithOptions.cs
@@ -45,7 +45,7 @@ public static void Example()
 
             // Split the data into training and test sets. Only training set is used in fitting
             // the created pipeline. Metrics are computed on the test.
-            var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1);
+            var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1);
 
             // Train the model.
             var model = pipeline.Fit(split.TrainSet);
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs
index 475fbe23c2..8330e0e92c 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbm.cs
@@ -13,8 +13,8 @@ public static void Example()
 
             // Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split
             // respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in
-            // the test split. The samplingKeyColumn parameter in Ranking.TrainTestSplit is used for this purpose.
-            var split = mlContext.Ranking.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId");
+            // the test split. The samplingKeyColumn parameter in Data.TrainTestSplit is used for this purpose.
+            var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId");
 
             // Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations.
             var pipeline = mlContext.Ranking.Trainers.LightGbm(
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs
index 101d08ec13..e0bc29a0af 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/LightGbmWithOptions.cs
@@ -16,8 +16,8 @@ public static void Example()
 
             // Leave out 10% of the dataset for testing. Since this is a ranking problem, we must ensure that the split
             // respects the GroupId column, i.e. rows with the same GroupId are either all in the train split or all in
-            // the test split. The samplingKeyColumn parameter in Ranking.TrainTestSplit is used for this purpose.
-            var split = mlContext.Ranking.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId");
+            // the test split. The samplingKeyColumn parameter in Data.TrainTestSplit is used for this purpose.
+            var split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumn: "GroupId");
 
             // Create the Estimator pipeline. For simplicity, we will train a small tree with 4 leaves and 2 boosting iterations.
             var pipeline = mlContext.Ranking.Trainers.LightGbm(
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs
index d23aebf141..816b980c9f 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbm.cs
@@ -23,7 +23,7 @@ public static void Example()
             // 21.60              0.02731            00.00                7.070               0               0.4690          6.4210              78.90             ...
             // 34.70              0.02729            00.00                7.070               0               0.4690          7.1850              61.10             ...
 
-            var split = mlContext.Regression.TrainTestSplit(dataView, testFraction: 0.1);
+            var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1);
 
             // Create the estimator, here we only need LightGbm trainer
             // as data is already processed in a form consumable by the trainer.
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs
index 260c546e7f..86c37af30b 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/LightGbmWithOptions.cs
@@ -25,7 +25,7 @@ public static void Example()
             // 21.60              0.02731            00.00                7.070               0               0.4690          6.4210              78.90             ...
             // 34.70              0.02729            00.00                7.070               0               0.4690          7.1850              61.10             ...
 
-            var split = mlContext.Regression.TrainTestSplit(dataView, testFraction: 0.1);
+            var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1);
 
             // Create a pipeline with LightGbm estimator with advanced options.
             // Here we only need LightGbm trainer as data is already processed
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs
index 7fe79c9a43..204322cec7 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquares.cs
@@ -39,7 +39,7 @@ public static void Example()
             // 21.60              0.02731            00.00                7.070               0               0.4690          6.4210              78.90
             // 34.70              0.02729            00.00                7.070               0               0.4690          7.1850              61.10
 
-            var split = mlContext.Regression.TrainTestSplit(dataView, testFraction: 0.2);
+            var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.2);
 
             // Create the estimator, here we only need OrdinaryLeastSquares trainer 
             // as data is already processed in a form consumable by the trainer
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs
index cbbd09342e..ccfa66aeb7 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/OrdinaryLeastSquaresWithOptions.cs
@@ -40,7 +40,7 @@ public static void Example()
             // 21.60              0.02731            00.00                7.070               0               0.4690          6.4210              78.90
             // 34.70              0.02729            00.00                7.070               0               0.4690          7.1850              61.10
 
-            var split = mlContext.Regression.TrainTestSplit(dataView, testFraction: 0.2);
+            var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.2);
 
             // Create the estimator, here we only need OrdinaryLeastSquares trainer 
             // as data is already processed in a form consumable by the trainer
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscent.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscent.cs
index 7ce8122f78..c97c0e7be1 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscent.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscent.cs
@@ -19,7 +19,7 @@ public static void Example()
 
             // Split the data into training and test sets. Only training set is used in fitting
             // the created pipeline. Metrics are computed on the test.
-            var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1);
+            var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1);
 
             // Train the model.
             var pipeline = mlContext.Regression.Trainers.StochasticDualCoordinateAscent();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscentWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscentWithOptions.cs
index 5cb29da11c..2f55403e9a 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscentWithOptions.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/StochasticDualCoordinateAscentWithOptions.cs
@@ -18,7 +18,7 @@ public static void Example()
 
             // Split the data into training and test sets. Only training set is used in fitting
             // the created pipeline. Metrics are computed on the test.
-            var split = mlContext.MulticlassClassification.TrainTestSplit(dataView, testFraction: 0.1);
+            var split = mlContext.Data.TrainTestSplit(dataView, testFraction: 0.1);
 
             // Create trainer options.
             var options = new SdcaRegressionTrainer.Options
diff --git a/docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs
index 1f471917c2..34b380d086 100644
--- a/docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs
+++ b/docs/samples/Microsoft.ML.Samples/Static/AveragedPerceptronBinaryClassification.cs
@@ -55,7 +55,7 @@ public static void AveragedPerceptronBinaryClassification()
 
             // Load the data, and leave 10% out, so we can use them for testing
             var data = loader.Load(dataFilePath);
-            var (trainData, testData) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+            var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
             // Create the Estimator
             var learningPipeline = loader.MakeNewEstimator()
diff --git a/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs
index c9835cd0ff..f34c6c4145 100644
--- a/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs
+++ b/docs/samples/Microsoft.ML.Samples/Static/FastTreeBinaryClassification.cs
@@ -56,7 +56,7 @@ public static void FastTreeBinaryClassification()
 
             // Loader the data, and leave 10% out, so we can use them for testing
             var data = loader.Load(dataFilePath);
-            var (trainData, testData) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+            var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
             // Create the Estimator
             var learningPipeline = loader.MakeNewEstimator()
diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs
index 90fa57e933..d2be520631 100644
--- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs
+++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBMBinaryClassification.cs
@@ -56,7 +56,7 @@ public static void LightGbmBinaryClassification()
 
             // Load the data, and leave 10% out, so we can use them for testing
             var data = loader.Load(dataFilePath);
-            var (trainData, testData) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+            var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
             // Create the Estimator
             var learningPipeline = loader.MakeNewEstimator()
diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs
index 67ea5ce8be..fe987b434d 100644
--- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs
+++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBMMulticlassWithInMemoryData.cs
@@ -52,7 +52,7 @@ public void MultiClassLightGbmStaticPipelineWithInMemoryData()
 
             // Split the static-typed data into training and test sets. Only training set is used in fitting
             // the created pipeline. Metrics are computed on the test.
-            var (trainingData, testingData) = mlContext.MulticlassClassification.TrainTestSplit(staticDataView, testFraction: 0.5);
+            var (trainingData, testingData) = mlContext.Data.TrainTestSplit(staticDataView, testFraction: 0.5);
 
             // Train the model.
             var model = pipe.Fit(trainingData);
diff --git a/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs b/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs
index cab1700636..344bf8bbc3 100644
--- a/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs
+++ b/docs/samples/Microsoft.ML.Samples/Static/LightGBMRegression.cs
@@ -28,7 +28,7 @@ public static void LightGbmRegression()
 
             // Load the data, and leave 10% out, so we can use them for testing
             var data = loader.Load(new MultiFileSource(dataFile));
-            var (trainData, testData) = mlContext.Regression.TrainTestSplit(data, testFraction: 0.1);
+            var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
             // The predictor that gets produced out of training
             LightGbmRegressionModelParameters pred = null;
diff --git a/docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs b/docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs
index 8150b6400b..8caf039a3d 100644
--- a/docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs
+++ b/docs/samples/Microsoft.ML.Samples/Static/SDCABinaryClassification.cs
@@ -55,7 +55,7 @@ public static void SdcaBinaryClassification()
 
             // Load the data, and leave 10% out, so we can use them for testing
             var data = loader.Load(dataFilePath);
-            var (trainData, testData) = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.1);
+            var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
             // Create the Estimator
             var learningPipeline = loader.MakeNewEstimator()
diff --git a/docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs b/docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs
index e4c7188d30..ac4e6b2a7a 100644
--- a/docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs
+++ b/docs/samples/Microsoft.ML.Samples/Static/SDCARegression.cs
@@ -25,7 +25,7 @@ public static void SdcaRegression()
 
             // Load the data, and leave 10% out, so we can use them for testing
             var data = loader.Load(dataFile);
-            var (trainData, testData) = mlContext.Regression.TrainTestSplit(data, testFraction: 0.1);
+            var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
             // The predictor that gets produced out of training
             LinearRegressionModelParameters pred = null;
diff --git a/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs
index d40ed37a99..f9d60a3429 100644
--- a/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs
+++ b/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs
@@ -20,6 +20,31 @@ public sealed class DataOperationsCatalog : IInternalCatalog
         IHostEnvironment IInternalCatalog.Environment => _env;
         private readonly IHostEnvironment _env;
 
+        /// <summary>
+        /// A pair of datasets: one for training and one for testing.
+        /// </summary>
+        public struct TrainTestData
+        {
+            /// <summary>
+            /// The dataset to use for training.
+            /// </summary>
+            public readonly IDataView TrainSet;
+            /// <summary>
+            /// The dataset to use for testing.
+            /// </summary>
+            public readonly IDataView TestSet;
+            /// <summary>
+            /// Creates a pair of datasets from an existing train set and test set.
+            /// </summary>
+            /// <param name="trainSet">Value stored in <see cref="TrainSet"/>.</param>
+            /// <param name="testSet">Value stored in <see cref="TestSet"/>.</param>
+            internal TrainTestData(IDataView trainSet, IDataView testSet)
+            {
+                TrainSet = trainSet;
+                TestSet = testSet;
+            }
+        }
+
         internal DataOperationsCatalog(IHostEnvironment env)
         {
             Contracts.AssertValue(env);
@@ -338,5 +363,83 @@ public IDataView TakeRows(IDataView input, long count)
 
             return new SkipTakeFilter(_env, options, input);
         }
+
+        /// <summary>
+        /// Split the dataset into the train set and test set according to the given fraction.
+        /// Respects the <paramref name="samplingKeyColumn"/> if provided.
+        /// </summary>
+        /// <param name="data">The dataset to split.</param>
+        /// <param name="testFraction">The fraction of data to go into the test set. Must be between 0 and 1 exclusive.</param>
+        /// <param name="samplingKeyColumn">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumn"/>,
+        /// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set. If <see langword="null"/>, no row grouping will be performed.</param>
+        /// <param name="seed">Seed for the random number generator used to select rows for the train-test split.</param>
+        /// <returns>The train set and the test set; both filter the same data on the same sampling key column, and the train filter is the complement of the test filter.</returns>
+        public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string samplingKeyColumn = null, uint? seed = null)
+        {
+            _env.CheckValue(data, nameof(data));
+            _env.CheckParam(0 < testFraction && testFraction < 1, nameof(testFraction), "Must be between 0 and 1 exclusive");
+            _env.CheckValueOrNull(samplingKeyColumn);
+
+            EnsureGroupPreservationColumn(_env, ref data, ref samplingKeyColumn, seed);
+
+            var trainFilter = new RangeFilter(_env, new RangeFilter.Options()
+            {
+                Column = samplingKeyColumn,
+                Min = 0,
+                Max = testFraction,
+                Complement = true
+            }, data);
+            var testFilter = new RangeFilter(_env, new RangeFilter.Options()
+            {
+                Column = samplingKeyColumn,
+                Min = 0,
+                Max = testFraction,
+                Complement = false
+            }, data);
+
+            return new TrainTestData(trainFilter, testFilter);
+        }
+
+        /// <summary>
+        /// Ensures the provided <paramref name="samplingKeyColumn"/> is valid for <see cref="RangeFilter"/>, hashing it if necessary, or creates a new column if <paramref name="samplingKeyColumn"/> is null. Both <paramref name="data"/> and <paramref name="samplingKeyColumn"/> may be replaced through their ref parameters.
+        /// </summary>
+        internal static void EnsureGroupPreservationColumn(IHostEnvironment env, ref IDataView data, ref string samplingKeyColumn, uint? seed = null)
+        {
+            // We need to handle two cases: if samplingKeyColumn is provided, we keep it as is when its type is valid for
+            // RangeFilter and otherwise hash it into a new column. If it is not provided, we generate a random number.
+
+            if (samplingKeyColumn == null)
+            {
+                samplingKeyColumn = data.Schema.GetTempColumnName("SamplingKeyColumn");
+                data = new GenerateNumberTransform(env, data, samplingKeyColumn, seed);
+            }
+            else
+            {
+                if (!data.Schema.TryGetColumnIndex(samplingKeyColumn, out int stratCol))
+                    throw env.ExceptSchemaMismatch(nameof(samplingKeyColumn), "SamplingKeyColumn", samplingKeyColumn);
+
+                var type = data.Schema[stratCol].Type;
+                if (!RangeFilter.IsValidRangeFilterColumnType(env, type))
+                {
+                    // Hash the samplingKeyColumn.
+                    // REVIEW: this could currently crash, since Hash only accepts a limited set
+                    // of column types. It used to be HashJoin, but we should probably extend Hash
+                    // instead of having two hash transformations.
+                    var origStratCol = samplingKeyColumn;
+                    int tmp;
+                    int inc = 0;
+
+                    // Pick an unused name of the form {original}_NNN for the new column that holds the hashed samplingKeyColumn.
+                    while (data.Schema.TryGetColumnIndex(samplingKeyColumn, out tmp))
+                        samplingKeyColumn = string.Format("{0}_{1:000}", origStratCol, ++inc);
+                    HashingEstimator.ColumnOptions columnOptions;
+                    if (seed.HasValue)
+                        columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30, seed.Value);
+                    else
+                        columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30);
+                    data = new HashingEstimator(env, columnOptions).Fit(data).Transform(data);
+                }
+            }
+        }
     }
 }
diff --git a/src/Microsoft.ML.Data/TrainCatalog.cs b/src/Microsoft.ML.Data/TrainCatalog.cs
index 072b924e6b..7fd623daa5 100644
--- a/src/Microsoft.ML.Data/TrainCatalog.cs
+++ b/src/Microsoft.ML.Data/TrainCatalog.cs
@@ -24,67 +24,6 @@ public abstract class TrainCatalogBase : IInternalCatalog
         [BestFriend]
         private protected IHostEnvironment Environment { get; }
 
-        /// <summary>
-        /// A pair of datasets, for the train and test set.
-        /// </summary>
-        public struct TrainTestData
-        {
-            /// <summary>
-            /// Training set.
-            /// </summary>
-            public readonly IDataView TrainSet;
-            /// <summary>
-            /// Testing set.
-            /// </summary>
-            public readonly IDataView TestSet;
-            /// <summary>
-            /// Create pair of datasets.
-            /// </summary>
-            /// <param name="trainSet">Training set.</param>
-            /// <param name="testSet">Testing set.</param>
-            internal TrainTestData(IDataView trainSet, IDataView testSet)
-            {
-                TrainSet = trainSet;
-                TestSet = testSet;
-            }
-        }
-
-        /// <summary>
-        /// Split the dataset into the train set and test set according to the given fraction.
-        /// Respects the <paramref name="samplingKeyColumn"/> if provided.
-        /// </summary>
-        /// <param name="data">The dataset to split.</param>
-        /// <param name="testFraction">The fraction of data to go into the test set.</param>
-        /// <param name="samplingKeyColumn">Name of a column to use for grouping rows. If two examples share the same value of the <paramref name="samplingKeyColumn"/>,
-        /// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
-        /// If <see langword="null"/> no row grouping will be performed.</param>
-        /// <param name="seed">Seed for the random number generator used to select rows for the train-test split.</param>
-        public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string samplingKeyColumn = null, uint? seed = null)
-        {
-            Environment.CheckValue(data, nameof(data));
-            Environment.CheckParam(0 < testFraction && testFraction < 1, nameof(testFraction), "Must be between 0 and 1 exclusive");
-            Environment.CheckValueOrNull(samplingKeyColumn);
-
-            EnsureGroupPreservationColumn(ref data, ref samplingKeyColumn, seed);
-
-            var trainFilter = new RangeFilter(Environment, new RangeFilter.Options()
-            {
-                Column = samplingKeyColumn,
-                Min = 0,
-                Max = testFraction,
-                Complement = true
-            }, data);
-            var testFilter = new RangeFilter(Environment, new RangeFilter.Options()
-            {
-                Column = samplingKeyColumn,
-                Min = 0,
-                Max = testFraction,
-                Complement = false
-            }, data);
-
-            return new TrainTestData(trainFilter, testFilter);
-        }
-
         /// <summary>
         /// Results for specific cross-validation fold.
         /// </summary>
@@ -156,7 +95,7 @@ private protected CrossValidationResult[] CrossValidateTrain(IDataView data, IEs
             Environment.CheckParam(numFolds > 1, nameof(numFolds), "Must be more than 1");
             Environment.CheckValueOrNull(samplingKeyColumn);
 
-            EnsureGroupPreservationColumn(ref data, ref samplingKeyColumn, seed);
+            DataOperationsCatalog.EnsureGroupPreservationColumn(Environment, ref data, ref samplingKeyColumn, seed);
 
             Func<int, CrossValidationResult> foldFunction =
                 fold =>
@@ -199,48 +138,6 @@ private protected TrainCatalogBase(IHostEnvironment env, string registrationName
             Environment = env;
         }
 
-        /// <summary>
-        /// Ensures the provided <paramref name="samplingKeyColumn"/> is valid for <see cref="RangeFilter"/>, hashing it if necessary, or creates a new column <paramref name="samplingKeyColumn"/> is null.
-        /// </summary>
-        private void EnsureGroupPreservationColumn(ref IDataView data, ref string samplingKeyColumn, uint? seed = null)
-        {
-            // We need to handle two cases: if samplingKeyColumn is provided, we use hashJoin to
-            // build a single hash of it. If it is not, we generate a random number.
-
-            if (samplingKeyColumn == null)
-            {
-                samplingKeyColumn = data.Schema.GetTempColumnName("SamplingKeyColumn");
-                data = new GenerateNumberTransform(Environment, data, samplingKeyColumn, seed);
-            }
-            else
-            {
-                if (!data.Schema.TryGetColumnIndex(samplingKeyColumn, out int stratCol))
-                    throw Environment.ExceptSchemaMismatch(nameof(samplingKeyColumn), "SamplingKeyColumn", samplingKeyColumn);
-
-                var type = data.Schema[stratCol].Type;
-                if (!RangeFilter.IsValidRangeFilterColumnType(Environment, type))
-                {
-                    // Hash the samplingKeyColumn.
-                    // REVIEW: this could currently crash, since Hash only accepts a limited set
-                    // of column types. It used to be HashJoin, but we should probably extend Hash
-                    // instead of having two hash transformations.
-                    var origStratCol = samplingKeyColumn;
-                    int tmp;
-                    int inc = 0;
-
-                    // Generate a new column with the hashed samplingKeyColumn.
-                    while (data.Schema.TryGetColumnIndex(samplingKeyColumn, out tmp))
-                        samplingKeyColumn = string.Format("{0}_{1:000}", origStratCol, ++inc);
-                    HashingEstimator.ColumnOptions columnOptions;
-                    if (seed.HasValue)
-                        columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30, seed.Value);
-                    else
-                        columnOptions = new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30);
-                    data = new HashingEstimator(Environment, columnOptions).Fit(data).Transform(data);
-                }
-            }
-        }
-
         /// <summary>
         /// Subclasses of <see cref="TrainContext"/> will provide little "extension method" hookable objects
         /// (for example, something like <see cref="BinaryClassificationCatalog.Trainers"/>). User code will only
diff --git a/src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs
index bd2ccad153..2713e96f1c 100644
--- a/src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs
+++ b/src/Microsoft.ML.StaticPipe/TrainingStaticExtensions.cs
@@ -30,7 +30,7 @@ public static class TrainingStaticExtensions
         /// If the <paramref name="stratificationColumn"/> is not provided, the random numbers generated to create it, will use this seed as value.
         /// And if it is not provided, the default value will be used.</param>
         /// <returns>A pair of datasets, for the train and test set.</returns>
-        public static (DataView<T> trainSet, DataView<T> testSet) TrainTestSplit<T>(this TrainCatalogBase catalog,
+        public static (DataView<T> trainSet, DataView<T> testSet) TrainTestSplit<T>(this DataOperationsCatalog catalog,
             DataView<T> data, double testFraction = 0.1, Func<T, PipelineColumn> stratificationColumn = null, uint? seed = null)
         {
             var env = StaticPipeUtils.GetEnvironment(data);
diff --git a/test/Microsoft.ML.Functional.Tests/Prediction.cs b/test/Microsoft.ML.Functional.Tests/Prediction.cs
index 9bad405571..b0d1fee62a 100644
--- a/test/Microsoft.ML.Functional.Tests/Prediction.cs
+++ b/test/Microsoft.ML.Functional.Tests/Prediction.cs
@@ -25,7 +25,7 @@ public void ReconfigurablePrediction()
             var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(),
                 hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator)
                 .Load(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename));
-            var split = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.2);
+            var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.2);
 
             // Create a pipeline to train on the housing data
             var pipeline = mlContext.Transforms.Concatenate("Features", new string[] {
diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs
index 49a5db6693..86157e2626 100644
--- a/test/Microsoft.ML.Functional.Tests/Validation.cs
+++ b/test/Microsoft.ML.Functional.Tests/Validation.cs
@@ -65,7 +65,7 @@ public void TrainWithValidationSet()
             var data = mlContext.Data.LoadFromTextFile<HousingRegression>(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true);
 
             // Create the train and validation set.
-            var dataSplit = mlContext.Regression.TrainTestSplit(data, testFraction: 0.2);
+            var dataSplit = mlContext.Data.TrainTestSplit(data, testFraction: 0.2);
             var trainData = dataSplit.TrainSet;
             var validData = dataSplit.TestSet;
 
diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs
index 813ec4c352..629eec9486 100644
--- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs
+++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs
@@ -722,13 +722,11 @@ public void TrainTestSplit()
             var dataPath = GetDataPath(TestDatasets.iris.trainFilename);
             var dataSource = new MultiFileSource(dataPath);
 
-            var ctx = new BinaryClassificationCatalog(env);
-
             var reader = TextLoaderStatic.CreateLoader(env,
                 c => (label: c.LoadFloat(0), features: c.LoadFloat(1, 4)));
             var data = reader.Load(dataSource);
 
-            var (train, test) = ctx.TrainTestSplit(data, 0.5);
+            var (train, test) = env.Data.TrainTestSplit(data, 0.5);
 
             // Just make sure that the train is about the same size as the test set.
             var trainCount = train.GetColumn(r => r.label).Count();
@@ -737,7 +735,7 @@ public void TrainTestSplit()
             Assert.InRange(trainCount * 1.0 / testCount, 0.8, 1.2);
 
             // Now stratify by label. Silly thing to do.
-            (train, test) = ctx.TrainTestSplit(data, 0.5, stratificationColumn: r => r.label);
+            (train, test) = env.Data.TrainTestSplit(data, 0.5, stratificationColumn: r => r.label);
             var trainLabels = train.GetColumn(r => r.label).Distinct();
             var testLabels = test.GetColumn(r => r.label).Distinct();
             Assert.True(trainLabels.Count() > 0);
diff --git a/test/Microsoft.ML.StaticPipelineTesting/Training.cs b/test/Microsoft.ML.StaticPipelineTesting/Training.cs
index 130b4cfba3..11e76586f0 100644
--- a/test/Microsoft.ML.StaticPipelineTesting/Training.cs
+++ b/test/Microsoft.ML.StaticPipelineTesting/Training.cs
@@ -1232,7 +1232,7 @@ public void MultiClassLightGbmStaticPipelineWithInMemoryData()
 
             // Split the static-typed data into training and test sets. Only training set is used in fitting
             // the created pipeline. Metrics are computed on the test.
-            var (trainingData, testingData) = mlContext.MulticlassClassification.TrainTestSplit(staticDataView, testFraction: 0.5);
+            var (trainingData, testingData) = mlContext.Data.TrainTestSplit(staticDataView, testFraction: 0.5);
 
             // Train the model.
             var model = pipe.Fit(trainingData);
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs
index 8e7e97145c..03a36249cb 100644
--- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs
+++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs
@@ -601,7 +601,7 @@ private void CrossValidationOn(string dataPath)
                     Predictions: mlContext.MulticlassClassification.Trainers.Sdca(r.Label, r.Features)));
 
             // Split the data 90:10 into train and test sets, train and evaluate.
-            var (trainData, testData) = mlContext.MulticlassClassification.TrainTestSplit(data, testFraction: 0.1);
+            var (trainData, testData) = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
             // Train the model.
             var model = pipeline.Fit(trainData);
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs
index 77c6145849..bf7ab04d51 100644
--- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs
+++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs
@@ -426,7 +426,7 @@ private void CrossValidationOn(string dataPath)
                 .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent());
 
             // Split the data 90:10 into train and test sets, train and evaluate.
-            var split = mlContext.MulticlassClassification.TrainTestSplit(data, testFraction: 0.1);
+            var split = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
 
             // Train the model.
             var model = pipeline.Fit(split.TrainSet);
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/TestApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/TestApi.cs
index a1fb6691e0..574db0f901 100644
--- a/test/Microsoft.ML.Tests/Scenarios/Api/TestApi.cs
+++ b/test/Microsoft.ML.Tests/Scenarios/Api/TestApi.cs
@@ -313,8 +313,8 @@ public void TestTrainTestSplit()
             // Let's test what train test properly works with seed.
             // In order to do that, let's split same dataset, but in one case we will use default seed value,
             // and in other case we set seed to be specific value.
-            var simpleSplit = mlContext.BinaryClassification.TrainTestSplit(input);
-            var splitWithSeed = mlContext.BinaryClassification.TrainTestSplit(input, seed: 10);
+            var simpleSplit = mlContext.Data.TrainTestSplit(input);
+            var splitWithSeed = mlContext.Data.TrainTestSplit(input, seed: 10);
 
             // Since test fraction is 0.1, it's much faster to compare test subsets of split.
             var simpleTestWorkClass = getWorkclass(simpleSplit.TestSet);
@@ -326,7 +326,7 @@ public void TestTrainTestSplit()
             // Now let's do same thing but with presence of stratificationColumn.
             // Rows with same values in this stratificationColumn should end up in same subset (train or test).
             // So let's break dataset by "Workclass" column.
-            var stratSplit = mlContext.BinaryClassification.TrainTestSplit(input, samplingKeyColumn: "Workclass");
+            var stratSplit = mlContext.Data.TrainTestSplit(input, samplingKeyColumn: "Workclass");
             var stratTrainWorkclass = getWorkclass(stratSplit.TrainSet);
             var stratTestWorkClass = getWorkclass(stratSplit.TestSet);
             // Let's get unique values for "Workclass" column from train subset.
@@ -338,7 +338,7 @@ public void TestTrainTestSplit()
 
             // Let's do same thing, but this time we will choose different seed.
             // Stratification column should still break dataset properly without same values in both subsets.
-            var stratSeed = mlContext.BinaryClassification.TrainTestSplit(input, samplingKeyColumn:"Workclass", seed: 1000000);
+            var stratSeed = mlContext.Data.TrainTestSplit(input, samplingKeyColumn:"Workclass", seed: 1000000);
             var stratTrainWithSeedWorkclass = getWorkclass(stratSeed.TrainSet);
             var stratTestWithSeedWorkClass = getWorkclass(stratSeed.TestSet);
             // Let's get unique values for "Workclass" column from train subset.