DataFrameMapper using FeatureUnion #64

Open: wants to merge 4 commits into base: master
34 changes: 8 additions & 26 deletions README.rst
@@ -4,10 +4,7 @@ Sklearn-pandas

This module provides a bridge between `Scikit-Learn <http://scikit-learn.org/stable/>`__'s machine learning methods and `pandas <http://pandas.pydata.org/>`__-style Data Frames.

In particular, it provides:

1. A way to map ``DataFrame`` columns to transformations, which are later recombined into features.
2. A compatibility shim for old ``scikit-learn`` versions to cross-validate a pipeline that takes a pandas ``DataFrame`` as input. This is only needed for ``scikit-learn<0.16.0`` (see `#11 <https://github.com/paulgb/sklearn-pandas/issues/11>`__ for details). It is deprecated and will likely be dropped in ``sklearn-pandas==2.0``.
In particular, it provides a way to map ``DataFrame`` columns to transformations, which are later recombined into features.

Installation
------------
@@ -29,14 +26,9 @@ Usage
Import
******

Import what you need from the ``sklearn_pandas`` package. The choices are:

* ``DataFrameMapper``, a class for mapping pandas data frame columns to different sklearn transformations
* ``cross_val_score``, similar to ``sklearn.cross_validation.cross_val_score`` but working on pandas DataFrames
Import what you need from the ``sklearn_pandas`` package::

For this demonstration, we will import both::

>>> from sklearn_pandas import DataFrameMapper, cross_val_score
>>> from sklearn_pandas import DataFrameMapper

For these examples, we'll also use pandas, numpy, and sklearn::

@@ -210,21 +202,6 @@ Working with sparse features

The stacking of the sparse features is done without ever densifying them.
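To illustrate what sparse-preserving stacking means, here is a generic scipy sketch (not sklearn-pandas' internal code): the sparse block is combined with a dense column without ever being densified.

```python
import numpy as np
from scipy import sparse

# One sparse feature block (e.g. one-hot encoder output) and one dense column.
onehot = sparse.csr_matrix(np.eye(3))
dense = np.array([[1.0], [2.0], [3.0]])

# scipy's sparse.hstack stacks the blocks column-wise and returns a sparse
# matrix; the sparse input is never converted to a dense array.
stacked = sparse.hstack([onehot, sparse.csr_matrix(dense)])
print(stacked.shape)  # (3, 4)
```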

Cross-Validation
----------------

Now that we can combine features from pandas DataFrames, we may want to use cross-validation to see whether our model works. ``scikit-learn<0.16.0`` provided features for cross-validation, but they expect numpy data structures and won't work with ``DataFrameMapper``.

To get around this, sklearn-pandas provides a wrapper on sklearn's ``cross_val_score`` function which passes a pandas DataFrame to the estimator rather than a numpy array::

>>> pipe = sklearn.pipeline.Pipeline([
... ('featurize', mapper),
... ('lm', sklearn.linear_model.LinearRegression())])
>>> np.round(cross_val_score(pipe, data.copy(), data.salary, 'r2'), 2)
array([ -1.09, -5.3 , -15.38])

Sklearn-pandas' ``cross_val_score`` function provides exactly the same interface as sklearn's function of the same name.


Changelog
---------
@@ -238,6 +215,10 @@ Development
the mapper. Resolves #55.
* Allow specifying an optional ``y`` argument during transform for
supervised transformations. Resolves #58.
* Use ``FeatureUnion``\ s with column selector transformers to perform
  transformations instead of custom code. This allows tuning the transformers'
  hyper-parameters during grid search and running transformations with
  multiple jobs. Resolves #61.
* Remove deprecated cross_validation compatibility classes and methods.
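The grid-search benefit mentioned above comes from ``FeatureUnion`` exposing its inner transformers through ``get_params``. A plain-sklearn sketch of that mechanism follows; the actual sklearn-pandas parameter names depend on how ``make_feature_union`` names its steps and are not shown in this diff, and the ``GridSearchCV`` import assumes a current scikit-learn.

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.rand(60, 4)
y = X[:, 0] + 2 * X[:, 1]

# The union's inner transformers are reachable via double-underscore names.
union = FeatureUnion([('scale', StandardScaler()),
                      ('pca', PCA())])
pipe = Pipeline([('features', union), ('model', Ridge())])

# Tune a hyper-parameter of a transformer nested inside the FeatureUnion.
grid = GridSearchCV(pipe,
                    {'features__pca__n_components': [1, 2]},
                    cv=3)
grid.fit(X, y)
print(grid.best_params_)
```

The same double-underscore syntax is what makes multi-job transformation possible: the union is an ordinary sklearn estimator, so ``n_jobs`` and cloning work as usual.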


1.1.0 (2015-12-06)
@@ -278,3 +259,4 @@ Other contributors:
* Zac Stewart
* Olivier Grisel
* Vitaley Zaretskey
* chanansh
1 change: 0 additions & 1 deletion sklearn_pandas/__init__.py
@@ -1,4 +1,3 @@
__version__ = '1.1.0'

from .dataframe_mapper import DataFrameMapper # NOQA
from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA
52 changes: 0 additions & 52 deletions sklearn_pandas/cross_validation.py

This file was deleted.

108 changes: 33 additions & 75 deletions sklearn_pandas/dataframe_mapper.py
@@ -1,32 +1,15 @@
import sys
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin

from .cross_validation import DataWrapper
from .pipeline import make_transformer_pipeline, _call_fit
from .pipeline import make_feature_union
from .utils import handle_feature

# load in the correct stringtype: str for py3, basestring for py2
string_types = str if sys.version_info >= (3, 0) else basestring


def _handle_feature(fea):
"""
Convert 1-dimensional arrays to 2-dimensional column vectors.
"""
if len(fea.shape) == 1:
fea = np.array([fea]).T

return fea


def _build_transformer(transformers):
if isinstance(transformers, list):
transformers = make_transformer_pipeline(*transformers)
return transformers


class DataFrameMapper(BaseEstimator, TransformerMixin):
"""
Map Pandas data frame column subsets to their own
@@ -51,11 +34,9 @@ def __init__(self, features, default=False, sparse=False):
sparse will return sparse matrix if set True and any of the
extracted features is sparse. Defaults to False.
"""
if isinstance(features, list):
features = [(columns, _build_transformer(transformers))
for (columns, transformers) in features]
self.pipeline = make_feature_union(features)
self.features = features
self.default = _build_transformer(default)
self.default = default
self.sparse = sparse

@property
@@ -86,45 +67,19 @@ def _unselected_columns(self, X):
column not in self._selected_columns]

def __setstate__(self, state):
self.features = state['features']

# compatibility for pickles before FeatureUnion
self.pipeline = state.get('pipeline',
make_feature_union(state['features']))

# compatibility shim for pickles created with sklearn-pandas<1.0.0
self.features = [(columns, _build_transformer(transformers))
for (columns, transformers) in state['features']]
self.sparse = state.get('sparse', False)

# compatibility shim for pickles created before ``default`` init
# argument existed
self.default = state.get('default', False)

def _get_col_subset(self, X, cols):
"""
Get a subset of columns from the given table X.

X a Pandas dataframe; the table to select columns from
cols a string or list of strings representing the columns
to select

Returns a numpy array with the data from the selected columns
"""
return_vector = False
if isinstance(cols, string_types):
return_vector = True
cols = [cols]

if isinstance(X, list):
X = [x[cols] for x in X]
X = pd.DataFrame(X)

elif isinstance(X, DataWrapper):
# if it's a datawrapper, unwrap it
X = X.df

if return_vector:
t = X[cols[0]].values
else:
t = X[cols].values

return t

def fit(self, X, y=None):
"""
Fit a transformation from the pipeline
@@ -134,15 +89,15 @@ def fit(self, X, y=None):
y the target vector relative to X, optional

"""
for columns, transformers in self.features:
if transformers is not None:
_call_fit(transformers.fit,
self._get_col_subset(X, columns), y)
if self.pipeline is not None:
self.pipeline.fit(X, y)

# handle features not explicitly selected
if self.default: # not False and not None
_call_fit(self.default.fit,
self._get_col_subset(X, self._unselected_columns(X)), y)
if self.default is not False:
# build JIT pipeline
default_features = [(self._unselected_columns(X), self.default)]
self.default_pipeline = make_feature_union(default_features)
self.default_pipeline.fit(X, y)
return self

def transform(self, X):
@@ -152,22 +107,13 @@ def transform(self, X):
X the data to transform
"""
extracted = []
for columns, transformers in self.features:
# columns could be a string or list of
# strings; we don't care because pandas
# will handle either.
Xt = self._get_col_subset(X, columns)
if transformers is not None:
Xt = transformers.transform(Xt)
extracted.append(_handle_feature(Xt))
if self.pipeline is not None: # some columns selected
extracted.append(handle_feature(self.pipeline.transform(X)))

# handle features not explicitly selected
if self.default is not False:
Xt = self._get_col_subset(X, self._unselected_columns(X))
if self.default is not None:
Xt = self.default.transform(Xt)
extracted.append(_handle_feature(Xt))

Xt = self.default_pipeline.transform(X)
extracted.append(handle_feature(Xt))

# combine the feature outputs into one array.
# at this point we lose track of which features
@@ -186,3 +132,15 @@ def transform(self, X):
stacked = np.hstack(extracted)

return stacked

    def get_params(self, deep=True):
        base_params = super(DataFrameMapper, self).get_params(deep=False)
        if not deep:
            return base_params
        else:
            fu_params = self.pipeline.get_params(deep=True)
            fu_params.update(base_params)
            return fu_params

    def set_params(self, **params):
        return self.pipeline.set_params(**params)
76 changes: 76 additions & 0 deletions sklearn_pandas/dataframe_mapper_pipeline.py
@@ -0,0 +1,76 @@
'''
An alternative implementation that uses plain sklearn Pipeline and FeatureUnion.
This makes the resulting transformer more compatible with other scikit-learn APIs.
'''
import numpy as np
import pandas as pd
from sklearn.pipeline import FeatureUnion


def _handle_feature(fea):
    """
    Convert 1-dimensional arrays to 2-dimensional column vectors.
    """
    if len(fea.shape) == 1:
        fea = np.array([fea]).T

    return fea


import unittest
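The tests below call a ``mapping_to_pipeline`` helper that does not appear anywhere in this diff. Assuming, per the module docstring, that it unions one column-selecting pipeline per ``(columns, transformer)`` pair, a minimal sketch could look like the following; both ``ColumnSelector`` and the body of ``mapping_to_pipeline`` are hypothetical, not the PR's actual implementation.

```python
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, make_pipeline


class ColumnSelector(BaseEstimator, TransformerMixin):
    """Select a subset of DataFrame columns (hypothetical helper)."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # pandas handles both a single column name and a list of names.
        return X[self.columns]


def mapping_to_pipeline(mapping):
    """Union one column-selecting pipeline per (columns, transformer) pair."""
    return FeatureUnion([
        (str(columns), make_pipeline(ColumnSelector(columns), transformer))
        for columns, transformer in mapping
    ])
```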


class TestPipelineMapping(unittest.TestCase):
    def setUp(self):
        from sklearn.datasets import load_boston
        data = load_boston()
        fm = data['data']
        y = data['target']
        columns = data['feature_names']
        df = pd.DataFrame(fm, columns=columns)
        self.df = df
        self.y = y
        from sklearn.preprocessing import StandardScaler
        from sklearn.preprocessing import OneHotEncoder
        self.mapping = [(['AGE'], StandardScaler()),
                        (['RAD'], OneHotEncoder(handle_unknown="ignore"))]

    def test_make_pipe(self):
        try:
            pipeline = mapping_to_pipeline(mapping=self.mapping)
        except Exception as e:
            self.fail('Unexpected exception raised: {}'.format(e))
        self.assertTrue(isinstance(pipeline, FeatureUnion))

    def test_transform(self):
        pipeline = mapping_to_pipeline(mapping=self.mapping)
        n_unique = self.df.apply(lambda x: x.nunique())
        try:
            transformed = pipeline.fit_transform(self.df)
        except Exception as e:
            self.fail('Unexpected exception raised: {}'.format(e))
        self.assertEqual(self.df.shape[0], transformed.shape[0])
        self.assertEqual(n_unique['RAD'] + 1, transformed.shape[1])

    def test_pipe_cv(self):
        pipeline = mapping_to_pipeline(mapping=self.mapping)
        from sklearn.linear_model import LinearRegression
        from sklearn.pipeline import make_pipeline
        full_pipeline = make_pipeline(pipeline, LinearRegression())
        from sklearn.cross_validation import cross_val_score
        try:
            # smoke test: just ensure cross-validation runs end to end
            scores = cross_val_score(full_pipeline, self.df, self.y)
        except Exception as e:
            self.fail('Unexpected exception raised: {}'.format(e))


if __name__ == '__main__':
    unittest.main()