diff --git a/README.rst b/README.rst
index ee59d48..5c7e40a 100644
--- a/README.rst
+++ b/README.rst
@@ -4,10 +4,7 @@ Sklearn-pandas
This module provides a bridge between `Scikit-Learn <http://scikit-learn.org/stable>`__'s machine learning methods and `pandas <http://pandas.pydata.org>`__-style Data Frames.
-In particular, it provides:
-
-1. A way to map ``DataFrame`` columns to transformations, which are later recombined into features.
-2. A compatibility shim for old ``scikit-learn`` versions to cross-validate a pipeline that takes a pandas ``DataFrame`` as input. This is only needed for ``scikit-learn<0.16.0`` (see `#11 <https://github.com/paulgb/sklearn-pandas/issues/11>`__ for details). It is deprecated and will likely be dropped in ``sklearn-pandas==2.0``.
+In particular, it provides a way to map ``DataFrame`` columns to transformations, which are later recombined into features.
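+
+For example, a minimal sketch (assuming a data frame with a numeric
+column ``age``; full usage is described below)::
+
+    >>> mapper = DataFrameMapper([
+    ...     (['age'], sklearn.preprocessing.StandardScaler()),
+    ... ])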
Installation
------------
@@ -29,14 +26,9 @@ Usage
Import
******
-Import what you need from the ``sklearn_pandas`` package. The choices are:
-
-* ``DataFrameMapper``, a class for mapping pandas data frame columns to different sklearn transformations
-* ``cross_val_score``, similar to ``sklearn.cross_validation.cross_val_score`` but working on pandas DataFrames
+Import what you need from the ``sklearn_pandas`` package::
-For this demonstration, we will import both::
-
- >>> from sklearn_pandas import DataFrameMapper, cross_val_score
+ >>> from sklearn_pandas import DataFrameMapper
For these examples, we'll also use pandas, numpy, and sklearn::
@@ -210,21 +202,6 @@ Working with sparse features
The stacking of the sparse features is done without ever densifying them.
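+
+For example, a minimal sketch (assuming a data frame ``df`` with a text
+column ``documents``)::
+
+    >>> mapper = DataFrameMapper([
+    ...     ('documents', CountVectorizer()),
+    ... ], sparse=True)
+    >>> X = mapper.fit_transform(df)  # X stays a scipy.sparse matrix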
-Cross-Validation
-----------------
-
-Now that we can combine features from pandas DataFrames, we may want to use cross-validation to see whether our model works. ``scikit-learn<0.16.0`` provided features for cross-validation, but they expect numpy data structures and won't work with ``DataFrameMapper``.
-
-To get around this, sklearn-pandas provides a wrapper on sklearn's ``cross_val_score`` function which passes a pandas DataFrame to the estimator rather than a numpy array::
-
- >>> pipe = sklearn.pipeline.Pipeline([
- ... ('featurize', mapper),
- ... ('lm', sklearn.linear_model.LinearRegression())])
- >>> np.round(cross_val_score(pipe, data.copy(), data.salary, 'r2'), 2)
- array([ -1.09, -5.3 , -15.38])
-
-Sklearn-pandas' ``cross_val_score`` function provides exactly the same interface as sklearn's function of the same name.
-
Changelog
---------
@@ -238,6 +215,10 @@ Development
the mapper. Resolves #55.
* Allow specifying an optional ``y`` argument during transform for
supervised transformations. Resolves #58.
+* Use ``FeatureUnion``s with column selector transformers to perform the
+  transformations instead of custom code. This allows tuning the transformers'
+  hyper-parameters during grid search and running the transformations with
+  multiple jobs. Resolves #61.
+* Remove deprecated cross_validation compatibility classes and methods.
1.1.0 (2015-12-06)
@@ -278,3 +259,4 @@ Other contributors:
* Zac Stewart
* Olivier Grisel
* Vitaley Zaretskey
+* chanansh
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
index 537ab56..59f8991 100644
--- a/sklearn_pandas/__init__.py
+++ b/sklearn_pandas/__init__.py
@@ -1,4 +1,3 @@
__version__ = '1.1.0'
from .dataframe_mapper import DataFrameMapper # NOQA
-from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA
diff --git a/sklearn_pandas/cross_validation.py b/sklearn_pandas/cross_validation.py
deleted file mode 100644
index 2e5d6f9..0000000
--- a/sklearn_pandas/cross_validation.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import warnings
-from sklearn import cross_validation
-from sklearn import grid_search
-
-DEPRECATION_MSG = '''
- Custom cross-validation compatibility shims are no longer needed for
- scikit-learn>=0.16.0 and will be dropped in sklearn-pandas==2.0.
-'''
-
-
-def cross_val_score(model, X, *args, **kwargs):
- warnings.warn(DEPRECATION_MSG, DeprecationWarning)
- X = DataWrapper(X)
- return cross_validation.cross_val_score(model, X, *args, **kwargs)
-
-
-class GridSearchCV(grid_search.GridSearchCV):
- def __init__(self, *args, **kwargs):
- warnings.warn(DEPRECATION_MSG, DeprecationWarning)
- super(GridSearchCV, self).__init__(*args, **kwargs)
-
- def fit(self, X, *params, **kwparams):
- return super(GridSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
-
- def predict(self, X, *params, **kwparams):
- return super(GridSearchCV, self).predict(DataWrapper(X), *params, **kwparams)
-
-
-try:
- class RandomizedSearchCV(grid_search.RandomizedSearchCV):
- def __init__(self, *args, **kwargs):
- warnings.warn(DEPRECATION_MSG, DeprecationWarning)
- super(RandomizedSearchCV, self).__init__(*args, **kwargs)
-
- def fit(self, X, *params, **kwparams):
- return super(RandomizedSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
-
- def predict(self, X, *params, **kwparams):
- return super(RandomizedSearchCV, self).predict(DataWrapper(X), *params, **kwparams)
-except AttributeError:
- pass
-
-
-class DataWrapper(object):
- def __init__(self, df):
- self.df = df
-
- def __len__(self):
- return len(self.df)
-
- def __getitem__(self, key):
- return self.df.iloc[key]
diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py
index 2f4c364..7677072 100644
--- a/sklearn_pandas/dataframe_mapper.py
+++ b/sklearn_pandas/dataframe_mapper.py
@@ -1,32 +1,15 @@
import sys
-import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
-from .cross_validation import DataWrapper
-from .pipeline import make_transformer_pipeline, _call_fit
+from .pipeline import make_feature_union
+from .utils import handle_feature
# load in the correct stringtype: str for py3, basestring for py2
string_types = str if sys.version_info >= (3, 0) else basestring
-def _handle_feature(fea):
- """
- Convert 1-dimensional arrays to 2-dimensional column vectors.
- """
- if len(fea.shape) == 1:
- fea = np.array([fea]).T
-
- return fea
-
-
-def _build_transformer(transformers):
- if isinstance(transformers, list):
- transformers = make_transformer_pipeline(*transformers)
- return transformers
-
-
class DataFrameMapper(BaseEstimator, TransformerMixin):
"""
Map Pandas data frame column subsets to their own
@@ -51,11 +34,9 @@ def __init__(self, features, default=False, sparse=False):
sparse will return sparse matrix if set True and any of the
extracted features is sparse. Defaults to False.
"""
- if isinstance(features, list):
- features = [(columns, _build_transformer(transformers))
- for (columns, transformers) in features]
+ self.pipeline = make_feature_union(features)
self.features = features
- self.default = _build_transformer(default)
+ self.default = default
self.sparse = sparse
@property
@@ -86,45 +67,19 @@ def _unselected_columns(self, X):
column not in self._selected_columns]
def __setstate__(self, state):
+ self.features = state['features']
+
+ # compatibility for pickles before FeatureUnion
+ self.pipeline = state.get('pipeline',
+ make_feature_union(state['features']))
+
# compatibility shim for pickles created with sklearn-pandas<1.0.0
- self.features = [(columns, _build_transformer(transformers))
- for (columns, transformers) in state['features']]
self.sparse = state.get('sparse', False)
# compatibility shim for pickles created before ``default`` init
# argument existed
self.default = state.get('default', False)
- def _get_col_subset(self, X, cols):
- """
- Get a subset of columns from the given table X.
-
- X a Pandas dataframe; the table to select columns from
- cols a string or list of strings representing the columns
- to select
-
- Returns a numpy array with the data from the selected columns
- """
- return_vector = False
- if isinstance(cols, string_types):
- return_vector = True
- cols = [cols]
-
- if isinstance(X, list):
- X = [x[cols] for x in X]
- X = pd.DataFrame(X)
-
- elif isinstance(X, DataWrapper):
- # if it's a datawrapper, unwrap it
- X = X.df
-
- if return_vector:
- t = X[cols[0]].values
- else:
- t = X[cols].values
-
- return t
-
def fit(self, X, y=None):
"""
Fit a transformation from the pipeline
@@ -134,15 +89,15 @@ def fit(self, X, y=None):
y the target vector relative to X, optional
"""
- for columns, transformers in self.features:
- if transformers is not None:
- _call_fit(transformers.fit,
- self._get_col_subset(X, columns), y)
+ if self.pipeline is not None:
+ self.pipeline.fit(X, y)
# handle features not explicitly selected
- if self.default: # not False and not None
- _call_fit(self.default.fit,
- self._get_col_subset(X, self._unselected_columns(X)), y)
+ if self.default is not False:
+            # build the default pipeline at fit time, since the unselected
+            # columns are only known once X is seen
+ default_features = [(self._unselected_columns(X), self.default)]
+ self.default_pipeline = make_feature_union(default_features)
+ self.default_pipeline.fit(X, y)
return self
def transform(self, X):
@@ -152,22 +107,13 @@ def transform(self, X):
X the data to transform
"""
extracted = []
- for columns, transformers in self.features:
- # columns could be a string or list of
- # strings; we don't care because pandas
- # will handle either.
- Xt = self._get_col_subset(X, columns)
- if transformers is not None:
- Xt = transformers.transform(Xt)
- extracted.append(_handle_feature(Xt))
+ if self.pipeline is not None: # some columns selected
+ extracted.append(handle_feature(self.pipeline.transform(X)))
# handle features not explicitly selected
if self.default is not False:
- Xt = self._get_col_subset(X, self._unselected_columns(X))
- if self.default is not None:
- Xt = self.default.transform(Xt)
- extracted.append(_handle_feature(Xt))
-
+ Xt = self.default_pipeline.transform(X)
+ extracted.append(handle_feature(Xt))
# combine the feature outputs into one array.
# at this point we lose track of which features
@@ -186,3 +132,15 @@ def transform(self, X):
stacked = np.hstack(extracted)
return stacked
+
+ def get_params(self, deep=True):
+ base_params = super(DataFrameMapper, self).get_params(deep=False)
+ if not deep:
+ return base_params
+ else:
+ fu_params = self.pipeline.get_params(deep=True)
+ fu_params.update(base_params)
+ return fu_params
+
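+    # NOTE: set_params delegates to the underlying FeatureUnion, so
+    # transformer parameters are addressed with the feature-name prefix,
+    # e.g. 'description__countvectorizer__analyzer' (see the tests).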
+ def set_params(self, **params):
+ return self.pipeline.set_params(**params)
\ No newline at end of file
diff --git a/sklearn_pandas/dataframe_mapper_pipeline.py b/sklearn_pandas/dataframe_mapper_pipeline.py
new file mode 100644
index 0000000..184a0aa
--- /dev/null
+++ b/sklearn_pandas/dataframe_mapper_pipeline.py
@@ -0,0 +1,76 @@
+'''
+An alternative implementation that uses plain sklearn ``Pipeline`` and
+``FeatureUnion`` objects, making the resulting transformer more compatible
+with other scikit-learn APIs. The tests below exercise it through
+``make_feature_union``.
+'''
+import unittest
+
+import pandas as pd
+from sklearn.pipeline import FeatureUnion
+
+from .pipeline import make_feature_union
+
+
+class TestPipelineMapping(unittest.TestCase):
+ def setUp(self):
+ from sklearn.datasets import load_boston
+ data = load_boston()
+ fm = data['data']
+ y = data['target']
+ columns = data['feature_names']
+ df = pd.DataFrame(fm, columns=columns)
+ self.df = df
+ self.y = y
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.preprocessing import OneHotEncoder
+ self.mapping = [(['AGE'], StandardScaler()),
+ (['RAD'], OneHotEncoder(handle_unknown="ignore"))
+ ]
+
+    def test_make_pipe(self):
+        try:
+            pipeline = make_feature_union(mapping=self.mapping)
+        except Exception as e:
+            self.fail('Unexpected exception raised: {}'.format(e))
+        self.assertIsInstance(pipeline, FeatureUnion)
+    def test_transform(self):
+        pipeline = make_feature_union(mapping=self.mapping)
+        n_unique = self.df.apply(lambda x: x.nunique())
+        try:
+            transformed = pipeline.fit_transform(self.df)
+        except Exception as e:
+            self.fail('Unexpected exception raised: {}'.format(e))
+        self.assertEqual(self.df.shape[0], transformed.shape[0])
+        # one column per unique RAD value plus the scaled AGE column
+        self.assertEqual(n_unique['RAD'] + 1, transformed.shape[1])
+
+    def test_pipe_cv(self):
+        from sklearn.linear_model import LinearRegression
+        from sklearn.pipeline import make_pipeline
+        from sklearn.cross_validation import cross_val_score
+
+        pipeline = make_feature_union(mapping=self.mapping)
+        full_pipeline = make_pipeline(pipeline, LinearRegression())
+        try:
+            cross_val_score(full_pipeline, self.df, self.y)
+        except Exception as e:
+            self.fail('Unexpected exception raised: {}'.format(e))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/sklearn_pandas/pipeline.py b/sklearn_pandas/pipeline.py
index 13fa23a..d80b881 100644
--- a/sklearn_pandas/pipeline.py
+++ b/sklearn_pandas/pipeline.py
@@ -1,7 +1,10 @@
import six
-from sklearn.pipeline import _name_estimators, Pipeline
+import collections
+from sklearn.pipeline import Pipeline, FeatureUnion, _name_estimators
from sklearn.utils import tosequence
+from .utils import PassThroughTransformer, ColumnSelectTransformer
+
def _call_fit(fit_method, X, y=None, **kwargs):
"""
@@ -56,6 +59,12 @@ def __init__(self, steps):
"'%s' (type %s) doesn't)"
% (estimator, type(estimator)))
+    # needed for compatibility with sklearn<=0.16, which doesn't define
+    # this property
+ @property
+ def named_steps(self):
+ return dict(self.steps)
+
def _pre_transform(self, X, y=None, **fit_params):
fit_params_steps = dict((step, {}) for step, _ in self.steps)
for pname, pval in six.iteritems(fit_params):
@@ -86,7 +95,64 @@ def fit_transform(self, X, y=None, **fit_params):
Xt, y, **fit_params).transform(Xt)
-def make_transformer_pipeline(*steps):
- """Construct a TransformerPipeline from the given estimators.
+def make_transformer_pipeline(feature_selector, transformers, feature_name):
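+    """
+    Build a ``(feature_name, TransformerPipeline)`` tuple whose pipeline
+    first selects the ``feature_selector`` column(s), then applies the given
+    ``transformers`` in order (``None`` becomes a ``PassThroughTransformer``).
+    """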
+ if not isinstance(transformers, list):
+ transformers = [transformers]
+ # transform None into PassThroughTransformer
+ transformers = [PassThroughTransformer() if t is None else t
+ for t in transformers]
+ cst = ColumnSelectTransformer
+ selector = [('selector', cst(feature_selector))]
+ # pipeline of selector followed by transformer
+ pipe = TransformerPipeline(selector + _name_estimators(transformers))
+ return (feature_name, pipe)
+
+
+def make_feature_union(mapping, n_jobs=1):
+ """
+ Create a FeatureUnion from the specified mapping.
+
+ Creates a FeatureUnion of TransformerPipelines that select the columns
+ given in the mapping as first step, then apply the specified transformers
+ sequentially.
+
+    :param mapping: a list of tuples where the first element is the column
+        name(s) and the second is the transformation or list of transformations
+        to apply. See ``DataFrameMapper`` for more information.
+ :param n_jobs: number of jobs to run in parallel (default 1)
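+
+    An illustrative doctest sketch (feature names are derived as in
+    ``get_feature_names`` below)::
+
+        >>> fu = make_feature_union([(['a'], None), ('b', None, 'beta')])
+        >>> [name for name, _ in fu.transformer_list]
+        ["['a']", 'beta']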
+ """
+ feature_names = get_feature_names(mapping)
+ # repeated feature names are not allowed, since they collide when
+ # doing set_params()
+ dupe_feat_names = [item for item, count in
+ collections.Counter(feature_names).items() if count > 1]
+ if len(dupe_feat_names):
+ raise ValueError(
+ 'Duplicated feature column names found: {}. Please '
+ 'provide custom feature names to '
+ 'disambiguate.'.format(dupe_feat_names))
+
+ feature_selectors = [el[0] for el in mapping]
+ transformers = [el[1] for el in mapping]
+ transformer_pipes = [
+ make_transformer_pipeline(feature_selector, transformers, feature_name)
+ for feature_selector, transformers, feature_name in
+ zip(feature_selectors, transformers, feature_names)]
+
+ if transformer_pipes: # at least one column to be transformed
+ feature_union = FeatureUnion(transformer_pipes, n_jobs=n_jobs)
+ else: # case when no columns were selected, but specifying default
+ feature_union = None
+ return feature_union
+
+
+def get_feature_names(mapping):
+ """
+ Derive feature names from given feature definition mapping.
+
+    By default, it takes the string representation of the selected column
+    name(s), but a custom name can be provided as the third element of the
+    feature definition tuple.
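+
+    An illustrative doctest sketch::
+
+        >>> get_feature_names([(['a'], None), ('b', None, 'beta')])
+        ["['a']", 'beta']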
"""
- return TransformerPipeline(_name_estimators(steps))
+ return [feat_def[2] if len(feat_def) == 3 else str(feat_def[0])
+ for feat_def in mapping]
\ No newline at end of file
diff --git a/sklearn_pandas/utils.py b/sklearn_pandas/utils.py
new file mode 100644
index 0000000..547a3cc
--- /dev/null
+++ b/sklearn_pandas/utils.py
@@ -0,0 +1,53 @@
+import numpy as np
+import pandas as pd
+
+from sklearn.base import BaseEstimator, TransformerMixin
+
+
+def handle_feature(fea):
+ """
+ Convert 1-dimensional arrays to 2-dimensional column vectors.
+ """
+ if len(fea.shape) == 1:
+ fea = np.array([fea]).T
+
+ return fea
+
+
+class PassThroughTransformer(BaseEstimator, TransformerMixin):
+ """
+    Transformer that passes the selected column(s) through, reshaping
+    1-dimensional input into a 2-dimensional column vector.
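+
+    Illustrative sketch (mirrors ``tests/test_utils.py``)::
+
+        >>> PassThroughTransformer().fit_transform(np.array([1, 2]))
+        array([[1],
+               [2]])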
+ """
+ def fit(self, X, y=None):
+ return self
+
+ def transform(self, X, y=None):
+ return handle_feature(X)
+
+
+class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
+ """
+    A simple transformer that selects a column or a group of columns from
+    a pandas DataFrame.
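+
+    Illustrative sketch (mirrors ``tests/test_utils.py``)::
+
+        >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+        >>> ColumnSelectTransformer(['a']).fit_transform(df)
+        array([[1],
+               [2]])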
+ """
+ def __init__(self, column_name):
+ """
+        :param column_name: a string or list of strings naming the column(s) to select
+ """
+ self.column_name = column_name
+
+ def fit(self, X, y=None):
+ if not (isinstance(X, pd.DataFrame) or isinstance(X, pd.Series)):
+ raise TypeError('Input should be a Pandas DataFrame or a Series (was %s)' % type(X))
+ column_name = self.column_name
+        # wrap a bare column name in a list for the check below; note that a
+        # list selector (e.g. ['a']) makes transform output shape (n, 1)
+        # rather than (n,)
+ if not isinstance(column_name, list):
+ column_name = [column_name]
+ for name in column_name:
+ if name not in X.columns:
+                raise ValueError('Selected column name %s is not in %s' % (name, X.columns))
+ return self
+
+ def transform(self, X, y=None):
+ return X[self.column_name].values
diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py
index 08454fb..fa715e0 100644
--- a/tests/test_dataframe_mapper.py
+++ b/tests/test_dataframe_mapper.py
@@ -1,5 +1,4 @@
import pytest
-from pkg_resources import parse_version
# In py3, mock is included with the unittest standard library
# In py2, it's a separate package
@@ -11,10 +10,9 @@
from pandas import DataFrame
import pandas as pd
from scipy import sparse
-from sklearn import __version__ as sklearn_version
-from sklearn.cross_validation import cross_val_score as sklearn_cv_score
+from sklearn.cross_validation import cross_val_score
from sklearn.datasets import load_iris
-from sklearn.pipeline import Pipeline
+from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
@@ -24,9 +22,8 @@
from numpy.testing import assert_array_equal
import pickle
-from sklearn_pandas import DataFrameMapper, cross_val_score
-from sklearn_pandas.dataframe_mapper import _handle_feature, _build_transformer
-from sklearn_pandas.pipeline import TransformerPipeline
+from sklearn_pandas.dataframe_mapper import DataFrameMapper
+from sklearn_pandas.utils import handle_feature
class MockXTransformer(object):
@@ -77,38 +74,6 @@ def complex_dataframe():
'feat2': [1, 2, 3, 2, 3, 4]})
-def test_nonexistent_columns_explicit_fail(simple_dataframe):
- """
- If a nonexistent column is selected, KeyError is raised.
- """
- mapper = DataFrameMapper(None)
- with pytest.raises(KeyError):
- mapper._get_col_subset(simple_dataframe, ['nonexistent_feature'])
-
-
-def test_get_col_subset_single_column_array(simple_dataframe):
- """
- Selecting a single column should return a 1-dimensional numpy array.
- """
- mapper = DataFrameMapper(None)
- array = mapper._get_col_subset(simple_dataframe, "a")
-
- assert type(array) == np.ndarray
- assert array.shape == (len(simple_dataframe["a"]),)
-
-
-def test_get_col_subset_single_column_list(simple_dataframe):
- """
- Selecting a list of columns (even if the list contains a single element)
- should return a 2-dimensional numpy array.
- """
- mapper = DataFrameMapper(None)
- array = mapper._get_col_subset(simple_dataframe, ["a"])
-
- assert type(array) == np.ndarray
- assert array.shape == (len(simple_dataframe["a"]), 1)
-
-
def test_cols_string_array(simple_dataframe):
"""
    If a string is specified as the columns, the transformer
@@ -144,7 +109,7 @@ def test_handle_feature_2dim():
2-dimensional arrays are returned unchanged.
"""
array = np.array([[1, 2], [3, 4]])
- assert_array_equal(_handle_feature(array), array)
+ assert_array_equal(handle_feature(array), array)
def test_handle_feature_1dim():
@@ -152,19 +117,7 @@ def test_handle_feature_1dim():
1-dimensional arrays are converted to 2-dimensional column vectors.
"""
array = np.array([1, 2])
- assert_array_equal(_handle_feature(array), np.array([[1], [2]]))
-
-
-def test_build_transformers():
- """
- When a list of transformers is passed, return a pipeline with
- each element of the iterable as a step of the pipeline.
- """
- transformers = [MockTClassifier(), MockTClassifier()]
- pipeline = _build_transformer(transformers)
- assert isinstance(pipeline, Pipeline)
- for ix, transformer in enumerate(transformers):
- assert pipeline.steps[ix][1] == transformer
+ assert_array_equal(handle_feature(array), np.array([[1], [2]]))
def test_selected_columns():
@@ -228,7 +181,7 @@ def test_default_transformer():
mapper = DataFrameMapper([], default=Imputer())
transformed = mapper.fit_transform(df)
- assert (transformed[: 0] == np.array([1., 2., 3.])).all()
+    assert (transformed[:, 0] == np.array([1., 2., 3.])).all()
def test_list_transformers_single_arg(simple_dataframe):
@@ -264,15 +217,23 @@ def test_list_transformers():
def test_list_transformers_old_unpickle(simple_dataframe):
- mapper = DataFrameMapper(None)
+ mapper = DataFrameMapper([('a', [MockXTransformer()])])
# simulate the mapper was created with < 1.0.0 code
mapper.features = [('a', [MockXTransformer()])]
mapper_pickled = pickle.dumps(mapper)
loaded_mapper = pickle.loads(mapper_pickled)
- transformer = loaded_mapper.features[0][1]
- assert isinstance(transformer, TransformerPipeline)
- assert isinstance(transformer.steps[0][1], MockXTransformer)
+ assert isinstance(loaded_mapper.pipeline, FeatureUnion)
+
+
+def test_list_transformers_nofeatunion_unpickle(simple_dataframe):
+ mapper = DataFrameMapper([('a', [MockXTransformer()])])
+    # simulate a mapper pickled before the FeatureUnion refactor
+ del mapper.pipeline
+ mapper_pickled = pickle.dumps(mapper)
+
+ loaded_mapper = pickle.loads(mapper_pickled)
+ assert isinstance(loaded_mapper.pipeline, FeatureUnion)
def test_default_old_unpickle(simple_dataframe):
@@ -396,23 +357,26 @@ def test_with_car_dataframe(cars_dataframe):
assert scores.mean() > 0.30
-@pytest.mark.skipIf(parse_version(sklearn_version) < parse_version('0.16'))
-def test_direct_cross_validation(iris_dataframe):
- """
- Starting with sklearn>=0.16.0 we no longer need CV wrappers for dataframes.
- See https://github.com/paulgb/sklearn-pandas/issues/11
- """
+def test_get_params():
pipeline = Pipeline([
("preprocess", DataFrameMapper([
- ("petal length (cm)", None),
- ("petal width (cm)", None),
- ("sepal length (cm)", None),
- ("sepal width (cm)", None),
+ ("description", CountVectorizer()),
])),
("classify", SVC(kernel='linear'))
])
- data = iris_dataframe.drop("species", axis=1)
- labels = iris_dataframe["species"]
- scores = sklearn_cv_score(pipeline, data, labels)
- assert scores.mean() > 0.96
- assert (scores.std() * 2) < 0.04
+ assert ('preprocess__description__countvectorizer__analyzer' in
+ pipeline.get_params())
+
+
+def test_set_params():
+ pipeline = Pipeline([
+ ("preprocess", DataFrameMapper([
+ ("description", CountVectorizer()),
+ ])),
+ ("classify", SVC(kernel='linear'))
+ ])
+ new_par = {'preprocess__description__countvectorizer__analyzer': 'another'}
+ pipeline.set_params(**new_par)
+ params = pipeline.get_params()
+ for k, v in new_par.items():
+ assert params[k] == v
\ No newline at end of file
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index ee57b57..f447197 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -1,5 +1,8 @@
+import pandas as pd
import pytest
-from sklearn_pandas.pipeline import TransformerPipeline, _call_fit
+
+from sklearn_pandas.pipeline import (
+ TransformerPipeline, _call_fit, make_feature_union, get_feature_names)
# In py3, mock is included with the unittest standard library
# In py2, it's a separate package
@@ -9,6 +12,11 @@
from mock import patch
+@pytest.fixture
+def df():
+ return pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+
+
class NoTransformT(object):
"""Transformer without transform method.
"""
@@ -98,3 +106,26 @@ def test_raises_type_error(mock_fit):
"""
with pytest.raises(TypeError):
_call_fit(Trans().fit, 'X', 'y', kwarg='kwarg')
+
+
+def test_make_feature_union_repeated_names(df):
+ mapping = [
+ ('a', None),
+ ('a', None)
+ ]
+ with pytest.raises(ValueError):
+        make_feature_union(mapping)
+
+
+def test_make_feature_union_repeated_names_custom_name(df):
+ mapping = [
+ ('a', None),
+ ('a', None, 'a_dupe')
+ ]
+    make_feature_union(mapping)  # should not raise
+
+
+def test_get_feature_names():
+ mapping = [(['a'], None), ('b', None, 'beta')]
+ names = get_feature_names(mapping)
+ assert names == ["['a']", 'beta']
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..24e18e7
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,56 @@
+import pandas as pd
+import numpy as np
+
+import pytest
+
+from sklearn_pandas.utils import (
+ ColumnSelectTransformer, PassThroughTransformer, handle_feature)
+
+
+@pytest.fixture
+def df():
+ return pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+
+
+def test_column_select_1d(df):
+ cst = ColumnSelectTransformer('a')
+ selected = cst.fit_transform(df)
+ assert (selected == np.array([1, 2])).all()
+
+
+def test_column_select_2d(df):
+ cst = ColumnSelectTransformer(['a'])
+ selected = cst.fit_transform(df)
+ assert (selected == np.array([[1], [2]])).all()
+
+
+def test_column_select_nonframe():
+ """
+ ColumnSelectTransformer only works with Series or DataFrames.
+ """
+ cst = ColumnSelectTransformer('a')
+ with pytest.raises(TypeError):
+ cst.fit_transform({})
+
+
+def test_column_select_nonexistent(df):
+ """
+    Trying to select a nonexistent column raises ValueError.
+ """
+ cst = ColumnSelectTransformer(['z'])
+ with pytest.raises(ValueError):
+        cst.fit_transform(df)
+
+
+def test_passthrough_transformer(df):
+ pt = PassThroughTransformer()
+ result = pt.fit_transform(np.array([1, 2]))
+ assert (result == np.array([[1], [2]])).all()
+
+
+def test_handle_feature():
+ feature = np.array([1, 2, 3])
+ assert handle_feature(feature).shape == (3, 1)
+
+ feature = np.array([[1], [2], [3]])
+ assert handle_feature(feature).shape == (3, 1)
\ No newline at end of file