diff --git a/README.rst b/README.rst
index ee59d48..5c7e40a 100644
--- a/README.rst
+++ b/README.rst
@@ -4,10 +4,7 @@ Sklearn-pandas
 This module provides a bridge between `Scikit-Learn <http://scikit-learn.org/stable>`__'s machine learning
 methods and `pandas <http://pandas.pydata.org>`__-style Data Frames.
 
-In particular, it provides:
-
-1. A way to map ``DataFrame`` columns to transformations, which are later recombined into features.
-2. A compatibility shim for old ``scikit-learn`` versions to cross-validate a pipeline that takes a pandas ``DataFrame`` as input. This is only needed for ``scikit-learn<0.16.0`` (see `#11 <https://github.com/paulgb/sklearn-pandas/issues/11>`__ for details). It is deprecated and will likely be dropped in ``skearn-pandas==2.0``.
+In particular, it provides a way to map ``DataFrame`` columns to transformations, which are later recombined into features.
 
 Installation
 ------------
@@ -29,14 +26,9 @@ Usage
 Import
 ******
 
-Import what you need from the ``sklearn_pandas`` package. The choices are:
-
-* ``DataFrameMapper``, a class for mapping pandas data frame columns to different sklearn transformations
-* ``cross_val_score``, similar to ``sklearn.cross_validation.cross_val_score`` but working on pandas DataFrames
+Import what you need from the ``sklearn_pandas`` package::
 
-For this demonstration, we will import both::
-
-    >>> from sklearn_pandas import DataFrameMapper, cross_val_score
+    >>> from sklearn_pandas import DataFrameMapper
 
 For these examples, we'll also use pandas, numpy, and sklearn::
 
@@ -210,21 +202,6 @@ Working with sparse features
 
 The stacking of the sparse features is done without ever densifying them.
 
-Cross-Validation
-----------------
-
-Now that we can combine features from pandas DataFrames, we may want to use cross-validation to see whether our model works. ``scikit-learn<0.16.0`` provided features for cross-validation, but they expect numpy data structures and won't work with ``DataFrameMapper``.
-
-To get around this, sklearn-pandas provides a wrapper on sklearn's ``cross_val_score`` function which passes a pandas DataFrame to the estimator rather than a numpy array::
-
-    >>> pipe = sklearn.pipeline.Pipeline([
-    ...     ('featurize', mapper),
-    ...     ('lm', sklearn.linear_model.LinearRegression())])
-    >>> np.round(cross_val_score(pipe, data.copy(), data.salary, 'r2'), 2)
-    array([ -1.09,  -5.3 , -15.38])
-
-Sklearn-pandas' ``cross_val_score`` function provides exactly the same interface as sklearn's function of the same name.
-
 Changelog
 ---------
 
@@ -238,6 +215,10 @@ Development
   the mapper. Resolves #55.
 * Allow specifying an optional ``y`` argument during transform for
   supervised transformations. Resolves #58.
+* Use ``FeatureUnion``\ s with column selector transformers to perform transformations
+  instead of custom code. This allows tuning the transformers' hyper-parameters during
+  grid search, and transforming using multiple jobs. Resolves #61.
+* Remove deprecated cross_validation compatibility classes and methods.
 
 1.1.0 (2015-12-06)
@@ -278,3 +259,4 @@ Other contributors:
 * Zac Stewart
 * Olivier Grisel
 * Vitaley Zaretskey
+* chanansh
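The changelog entry above is the heart of this change: because the mapper is now backed by a ``FeatureUnion``, nested transformer hyper-parameters become reachable through ``get_params``/``set_params``. A minimal sketch of what this enables, using the parameter path exercised by ``test_get_params`` later in this diff; the ``max_features`` grid is a hypothetical example, and ``sklearn.grid_search`` is the era-appropriate module (``sklearn.model_selection`` in newer releases)::

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.grid_search import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC

    from sklearn_pandas import DataFrameMapper

    pipeline = Pipeline([
        ('preprocess', DataFrameMapper([('description', CountVectorizer())])),
        ('classify', SVC(kernel='linear')),
    ])

    # The mapper delegates get_params()/set_params() to its internal
    # FeatureUnion, so the vectorizer's parameters are tunable from outside:
    param_grid = {
        'preprocess__description__countvectorizer__max_features': [100, 500],
    }
    grid = GridSearchCV(pipeline, param_grid)
    # grid.fit(df, y) would then tune the CountVectorizer nested inside the
    # mapper, given a DataFrame `df` with a 'description' column and labels `y`.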
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
index 537ab56..59f8991 100644
--- a/sklearn_pandas/__init__.py
+++ b/sklearn_pandas/__init__.py
@@ -1,4 +1,3 @@
 __version__ = '1.1.0'
 
 from .dataframe_mapper import DataFrameMapper  # NOQA
-from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV  # NOQA
diff --git a/sklearn_pandas/cross_validation.py b/sklearn_pandas/cross_validation.py
deleted file mode 100644
index 2e5d6f9..0000000
--- a/sklearn_pandas/cross_validation.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import warnings
-from sklearn import cross_validation
-from sklearn import grid_search
-
-DEPRECATION_MSG = '''
-    Custom cross-validation compatibility shims are no longer needed for
-    scikit-learn>=0.16.0 and will be dropped in sklearn-pandas==2.0.
-'''
-
-
-def cross_val_score(model, X, *args, **kwargs):
-    warnings.warn(DEPRECATION_MSG, DeprecationWarning)
-    X = DataWrapper(X)
-    return cross_validation.cross_val_score(model, X, *args, **kwargs)
-
-
-class GridSearchCV(grid_search.GridSearchCV):
-    def __init__(self, *args, **kwargs):
-        warnings.warn(DEPRECATION_MSG, DeprecationWarning)
-        super(GridSearchCV, self).__init__(*args, **kwargs)
-
-    def fit(self, X, *params, **kwparams):
-        return super(GridSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
-
-    def predict(self, X, *params, **kwparams):
-        return super(GridSearchCV, self).predict(DataWrapper(X), *params, **kwparams)
-
-
-try:
-    class RandomizedSearchCV(grid_search.RandomizedSearchCV):
-        def __init__(self, *args, **kwargs):
-            warnings.warn(DEPRECATION_MSG, DeprecationWarning)
-            super(RandomizedSearchCV, self).__init__(*args, **kwargs)
-
-        def fit(self, X, *params, **kwparams):
-            return super(RandomizedSearchCV, self).fit(DataWrapper(X), *params, **kwparams)
-
-        def predict(self, X, *params, **kwparams):
-            return super(RandomizedSearchCV, self).predict(DataWrapper(X), *params, **kwparams)
-except AttributeError:
-    pass
-
-
-class DataWrapper(object):
-    def __init__(self, df):
-        self.df = df
-
-    def __len__(self):
-        return len(self.df)
-
-    def __getitem__(self, key):
-        return self.df.iloc[key]
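With the shims deleted, plain scikit-learn (>=0.16) handles ``DataFrame`` inputs in cross-validation by itself, which is why the wrapper module above can go. A hedged sketch of the replacement usage, mirroring the README example removed earlier; the ``data``/``mapper`` definitions are hypothetical stand-ins for the README's::

    import pandas as pd
    import sklearn.linear_model
    import sklearn.pipeline
    from sklearn.cross_validation import cross_val_score  # sklearn.model_selection in newer releases
    from sklearn.preprocessing import StandardScaler

    from sklearn_pandas import DataFrameMapper

    # hypothetical stand-ins for the README's `data` and `mapper`
    data = pd.DataFrame({'age': [25, 32, 47, 51, 38, 29],
                         'salary': [40., 60., 90., 100., 70., 50.]})
    mapper = DataFrameMapper([(['age'], StandardScaler())])

    pipe = sklearn.pipeline.Pipeline([
        ('featurize', mapper),
        ('lm', sklearn.linear_model.LinearRegression()),
    ])
    # scikit-learn's own cross_val_score accepts the DataFrame directly:
    scores = cross_val_score(pipe, data.copy(), data.salary, scoring='r2')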
""" - if isinstance(features, list): - features = [(columns, _build_transformer(transformers)) - for (columns, transformers) in features] + self.pipeline = make_feature_union(features) self.features = features - self.default = _build_transformer(default) + self.default = default self.sparse = sparse @property @@ -86,45 +67,19 @@ def _unselected_columns(self, X): column not in self._selected_columns] def __setstate__(self, state): + self.features = state['features'] + + # compatibility for pickles before FeatureUnion + self.pipeline = state.get('pipeline', + make_feature_union(state['features'])) + # compatibility shim for pickles created with sklearn-pandas<1.0.0 - self.features = [(columns, _build_transformer(transformers)) - for (columns, transformers) in state['features']] self.sparse = state.get('sparse', False) # compatibility shim for pickles created before ``default`` init # argument existed self.default = state.get('default', False) - def _get_col_subset(self, X, cols): - """ - Get a subset of columns from the given table X. - - X a Pandas dataframe; the table to select columns from - cols a string or list of strings representing the columns - to select - - Returns a numpy array with the data from the selected columns - """ - return_vector = False - if isinstance(cols, string_types): - return_vector = True - cols = [cols] - - if isinstance(X, list): - X = [x[cols] for x in X] - X = pd.DataFrame(X) - - elif isinstance(X, DataWrapper): - # if it's a datawrapper, unwrap it - X = X.df - - if return_vector: - t = X[cols[0]].values - else: - t = X[cols].values - - return t - def fit(self, X, y=None): """ Fit a transformation from the pipeline @@ -134,15 +89,15 @@ def fit(self, X, y=None): y the target vector relative to X, optional """ - for columns, transformers in self.features: - if transformers is not None: - _call_fit(transformers.fit, - self._get_col_subset(X, columns), y) + if self.pipeline is not None: + self.pipeline.fit(X, y) # handle features not explicitly selected - if self.default: # not False and not None - _call_fit(self.default.fit, - self._get_col_subset(X, self._unselected_columns(X)), y) + if self.default is not False: + # build JIT pipeline + default_features = [(self._unselected_columns(X), self.default)] + self.default_pipeline = make_feature_union(default_features) + self.default_pipeline.fit(X, y) return self def transform(self, X): @@ -152,22 +107,13 @@ def transform(self, X): X the data to transform """ extracted = [] - for columns, transformers in self.features: - # columns could be a string or list of - # strings; we don't care because pandas - # will handle either. - Xt = self._get_col_subset(X, columns) - if transformers is not None: - Xt = transformers.transform(Xt) - extracted.append(_handle_feature(Xt)) + if self.pipeline is not None: # some columns selected + extracted.append(handle_feature(self.pipeline.transform(X))) # handle features not explicitly selected if self.default is not False: - Xt = self._get_col_subset(X, self._unselected_columns(X)) - if self.default is not None: - Xt = self.default.transform(Xt) - extracted.append(_handle_feature(Xt)) - + Xt = self.default_pipeline.transform(X) + extracted.append(handle_feature(Xt)) # combine the feature outputs into one array. 
diff --git a/sklearn_pandas/dataframe_mapper_pipeline.py b/sklearn_pandas/dataframe_mapper_pipeline.py
new file mode 100644
index 0000000..184a0aa
--- /dev/null
+++ b/sklearn_pandas/dataframe_mapper_pipeline.py
@@ -0,0 +1,76 @@
+'''
+An alternative implementation which uses just sklearn's Pipeline and
+FeatureUnion. This makes the resulting transformer more compatible with
+other scikit-learn APIs.
+'''
+import unittest
+
+import numpy as np
+import pandas as pd
+from sklearn.pipeline import FeatureUnion
+
+# The tests below exercise the FeatureUnion factory under the name
+# ``mapping_to_pipeline``; alias it here so the module is importable.
+from .pipeline import make_feature_union as mapping_to_pipeline
+
+
+def _handle_feature(fea):
+    """
+    Convert 1-dimensional arrays to 2-dimensional column vectors.
+    """
+    if len(fea.shape) == 1:
+        fea = np.array([fea]).T
+
+    return fea
+
+
+class TestPipelineMapping(unittest.TestCase):
+    def setUp(self):
+        from sklearn.datasets import load_boston
+        data = load_boston()
+        fm = data['data']
+        y = data['target']
+        columns = data['feature_names']
+        df = pd.DataFrame(fm, columns=columns)
+        self.df = df
+        self.y = y
+        from sklearn.preprocessing import StandardScaler
+        from sklearn.preprocessing import OneHotEncoder
+        self.mapping = [(['AGE'], StandardScaler()),
+                        (['RAD'], OneHotEncoder(handle_unknown="ignore"))]
+
+    def test_make_pipe(self):
+        try:
+            pipeline = mapping_to_pipeline(mapping=self.mapping)
+        except Exception as e:
+            self.fail('Unexpected exception raised: {}'.format(e))
+        self.assertTrue(isinstance(pipeline, FeatureUnion))
+
+    def test_transform(self):
+        pipeline = mapping_to_pipeline(mapping=self.mapping)
+        n_unique = self.df.apply(lambda x: x.nunique())
+        try:
+            transformed = pipeline.fit_transform(self.df)
+        except Exception as e:
+            self.fail('Unexpected exception raised: {}'.format(e))
+        self.assertEqual(self.df.shape[0], transformed.shape[0])
+        self.assertEqual(n_unique['RAD'] + 1, transformed.shape[1])
+
+    def test_pipe_cv(self):
+        pipeline = mapping_to_pipeline(mapping=self.mapping)
+        from sklearn.linear_model import LinearRegression
+        from sklearn.pipeline import make_pipeline
+        full_pipeline = make_pipeline(pipeline, LinearRegression())
+        from sklearn.cross_validation import cross_val_score
+        try:
+            cross_val_score(full_pipeline, self.df, self.y)
+        except Exception as e:
+            self.fail('Unexpected exception raised: {}'.format(e))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/sklearn_pandas/pipeline.py b/sklearn_pandas/pipeline.py
index 13fa23a..d80b881 100644
--- a/sklearn_pandas/pipeline.py
+++ b/sklearn_pandas/pipeline.py
@@ -1,7 +1,10 @@
 import six
-from sklearn.pipeline import _name_estimators, Pipeline
+import collections
+from sklearn.pipeline import Pipeline, FeatureUnion, _name_estimators
 from sklearn.utils import tosequence
 
+from .utils import PassThroughTransformer, ColumnSelectTransformer
+
 
 def _call_fit(fit_method, X, y=None, **kwargs):
     """
@@ -56,6 +59,12 @@ def __init__(self, steps):
                                 "'%s' (type %s) doesn't)"
                                 % (estimator, type(estimator)))
 
+    # needed for compatibility with sklearn<=0.16, which doesn't have
+    # this property defined
+    @property
+    def named_steps(self):
+        return dict(self.steps)
+
     def _pre_transform(self, X, y=None, **fit_params):
         fit_params_steps = dict((step, {}) for step, _ in self.steps)
         for pname, pval in six.iteritems(fit_params):
@@ -86,7 +95,64 @@ def fit_transform(self, X, y=None, **fit_params):
             Xt, y, **fit_params).transform(Xt)
 
 
-def make_transformer_pipeline(*steps):
-    """Construct a TransformerPipeline from the given estimators.
+def make_transformer_pipeline(feature_selector, transformers, feature_name):
+    if not isinstance(transformers, list):
+        transformers = [transformers]
+    # replace None with PassThroughTransformer
+    transformers = [PassThroughTransformer() if t is None else t
+                    for t in transformers]
+    cst = ColumnSelectTransformer
+    selector = [('selector', cst(feature_selector))]
+    # pipeline of the selector followed by the transformers
+    pipe = TransformerPipeline(selector + _name_estimators(transformers))
+    return (feature_name, pipe)
+
+
+def make_feature_union(mapping, n_jobs=1):
+    """
+    Create a FeatureUnion from the specified mapping.
+
+    Creates a FeatureUnion of TransformerPipelines that select the columns
+    given in the mapping as their first step, then apply the specified
+    transformers sequentially.
+
+    :param mapping: a list of tuples where the first element is the column
+        name(s) and the second is the transformation or list of
+        transformations to apply. See ``DataFrameMapper`` for more
+        information.
+    :param n_jobs: number of jobs to run in parallel (default 1)
+    """
+    feature_names = get_feature_names(mapping)
+    # repeated feature names are not allowed, since they collide when
+    # doing set_params()
+    dupe_feat_names = [item for item, count in
+                       collections.Counter(feature_names).items() if count > 1]
+    if len(dupe_feat_names):
+        raise ValueError(
+            'Duplicated feature column names found: {}. Please '
+            'provide custom feature names to '
+            'disambiguate.'.format(dupe_feat_names))
+
+    feature_selectors = [el[0] for el in mapping]
+    transformers = [el[1] for el in mapping]
+    transformer_pipes = [
+        make_transformer_pipeline(feature_selector, transformers, feature_name)
+        for feature_selector, transformers, feature_name in
+        zip(feature_selectors, transformers, feature_names)]
+
+    if transformer_pipes:  # at least one column to be transformed
+        feature_union = FeatureUnion(transformer_pipes, n_jobs=n_jobs)
+    else:  # case when no columns were selected, but a default was specified
+        feature_union = None
+    return feature_union
+
+
+def get_feature_names(mapping):
+    """
+    Derive feature names from a given feature definition mapping.
+
+    By default, it takes the string representation of the selected column(s)
+    names, but a custom name can be provided as the third element of the
+    feature definition tuple.
     """
-    return TransformerPipeline(_name_estimators(steps))
+    return [feat_def[2] if len(feat_def) == 3 else str(feat_def[0])
+            for feat_def in mapping]
\ No newline at end of file
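A usage sketch for the two new public helpers, matching the behaviour pinned down in ``tests/test_pipeline.py`` later in this diff: duplicate feature names raise ``ValueError`` unless a custom name is supplied as a third tuple element::

    from sklearn_pandas.pipeline import make_feature_union, get_feature_names

    mapping = [
        ('a', None),            # feature named 'a' (str() of the selector)
        ('a', None, 'a_dupe'),  # custom third element avoids the collision
    ]
    get_feature_names(mapping)           # ['a', 'a_dupe']
    union = make_feature_union(mapping)  # a sklearn FeatureUnion of two pipelines

    # Without the custom name, make_feature_union([('a', None), ('a', None)])
    # would raise: ValueError('Duplicated feature column names found: ...')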
+ """ + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + return handle_feature(X) + + +class ColumnSelectTransformer(BaseEstimator, TransformerMixin): + """ + A simple Transformer which selects a column or a group of columns from a + Pandas' DataFrame + """ + def __init__(self, column_name): + """ + A Transformer which selects a column or a group of columns from a Pandas' DataFrame + :param column_name: string or list of strings of columns to select + """ + self.column_name = column_name + + def fit(self, X, y=None): + if not (isinstance(X, pd.DataFrame) or isinstance(X, pd.Series)): + raise TypeError('Input should be a Pandas DataFrame or a Series (was %s)' % type(X)) + column_name = self.column_name + # in case in bracketed as [] to output a (n,1) rather (n,) shape + if not isinstance(column_name, list): + column_name = [column_name] + for name in column_name: + if name not in X.columns: + raise ValueError('Select column name %s is not in %s' % (name, X.columns)) + return self + + def transform(self, X, y=None): + return X[self.column_name].values diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py index 08454fb..fa715e0 100644 --- a/tests/test_dataframe_mapper.py +++ b/tests/test_dataframe_mapper.py @@ -1,5 +1,4 @@ import pytest -from pkg_resources import parse_version # In py3, mock is included with the unittest standard library # In py2, it's a separate package @@ -11,10 +10,9 @@ from pandas import DataFrame import pandas as pd from scipy import sparse -from sklearn import __version__ as sklearn_version -from sklearn.cross_validation import cross_val_score as sklearn_cv_score +from sklearn.cross_validation import cross_val_score from sklearn.datasets import load_iris -from sklearn.pipeline import Pipeline +from sklearn.pipeline import Pipeline, FeatureUnion from sklearn.svm import SVC from sklearn.feature_extraction.text import CountVectorizer from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder @@ -24,9 +22,8 @@ from numpy.testing import assert_array_equal import pickle -from sklearn_pandas import DataFrameMapper, cross_val_score -from sklearn_pandas.dataframe_mapper import _handle_feature, _build_transformer -from sklearn_pandas.pipeline import TransformerPipeline +from sklearn_pandas.dataframe_mapper import DataFrameMapper +from sklearn_pandas.utils import handle_feature class MockXTransformer(object): @@ -77,38 +74,6 @@ def complex_dataframe(): 'feat2': [1, 2, 3, 2, 3, 4]}) -def test_nonexistent_columns_explicit_fail(simple_dataframe): - """ - If a nonexistent column is selected, KeyError is raised. - """ - mapper = DataFrameMapper(None) - with pytest.raises(KeyError): - mapper._get_col_subset(simple_dataframe, ['nonexistent_feature']) - - -def test_get_col_subset_single_column_array(simple_dataframe): - """ - Selecting a single column should return a 1-dimensional numpy array. - """ - mapper = DataFrameMapper(None) - array = mapper._get_col_subset(simple_dataframe, "a") - - assert type(array) == np.ndarray - assert array.shape == (len(simple_dataframe["a"]),) - - -def test_get_col_subset_single_column_list(simple_dataframe): - """ - Selecting a list of columns (even if the list contains a single element) - should return a 2-dimensional numpy array. 
- """ - mapper = DataFrameMapper(None) - array = mapper._get_col_subset(simple_dataframe, ["a"]) - - assert type(array) == np.ndarray - assert array.shape == (len(simple_dataframe["a"]), 1) - - def test_cols_string_array(simple_dataframe): """ If an string specified as the columns, the transformer @@ -144,7 +109,7 @@ def test_handle_feature_2dim(): 2-dimensional arrays are returned unchanged. """ array = np.array([[1, 2], [3, 4]]) - assert_array_equal(_handle_feature(array), array) + assert_array_equal(handle_feature(array), array) def test_handle_feature_1dim(): @@ -152,19 +117,7 @@ def test_handle_feature_1dim(): 1-dimensional arrays are converted to 2-dimensional column vectors. """ array = np.array([1, 2]) - assert_array_equal(_handle_feature(array), np.array([[1], [2]])) - - -def test_build_transformers(): - """ - When a list of transformers is passed, return a pipeline with - each element of the iterable as a step of the pipeline. - """ - transformers = [MockTClassifier(), MockTClassifier()] - pipeline = _build_transformer(transformers) - assert isinstance(pipeline, Pipeline) - for ix, transformer in enumerate(transformers): - assert pipeline.steps[ix][1] == transformer + assert_array_equal(handle_feature(array), np.array([[1], [2]])) def test_selected_columns(): @@ -228,7 +181,7 @@ def test_default_transformer(): mapper = DataFrameMapper([], default=Imputer()) transformed = mapper.fit_transform(df) - assert (transformed[: 0] == np.array([1., 2., 3.])).all() + assert (transformed[:0] == np.array([1., 2., 3.])).all() def test_list_transformers_single_arg(simple_dataframe): @@ -264,15 +217,23 @@ def test_list_transformers(): def test_list_transformers_old_unpickle(simple_dataframe): - mapper = DataFrameMapper(None) + mapper = DataFrameMapper([('a', [MockXTransformer()])]) # simulate the mapper was created with < 1.0.0 code mapper.features = [('a', [MockXTransformer()])] mapper_pickled = pickle.dumps(mapper) loaded_mapper = pickle.loads(mapper_pickled) - transformer = loaded_mapper.features[0][1] - assert isinstance(transformer, TransformerPipeline) - assert isinstance(transformer.steps[0][1], MockXTransformer) + assert isinstance(loaded_mapper.pipeline, FeatureUnion) + + +def test_list_transformers_nofeatunion_unpickle(simple_dataframe): + mapper = DataFrameMapper([('a', [MockXTransformer()])]) + # simulate the mapper was created with < 1.0.0 code + del mapper.pipeline + mapper_pickled = pickle.dumps(mapper) + + loaded_mapper = pickle.loads(mapper_pickled) + assert isinstance(loaded_mapper.pipeline, FeatureUnion) def test_default_old_unpickle(simple_dataframe): @@ -396,23 +357,26 @@ def test_with_car_dataframe(cars_dataframe): assert scores.mean() > 0.30 -@pytest.mark.skipIf(parse_version(sklearn_version) < parse_version('0.16')) -def test_direct_cross_validation(iris_dataframe): - """ - Starting with sklearn>=0.16.0 we no longer need CV wrappers for dataframes. 
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index ee57b57..f447197 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -1,5 +1,8 @@
+import pandas as pd
 import pytest
-from sklearn_pandas.pipeline import TransformerPipeline, _call_fit
+
+from sklearn_pandas.pipeline import (
+    TransformerPipeline, _call_fit, make_feature_union, get_feature_names)
 
 # In py3, mock is included with the unittest standard library
 # In py2, it's a separate package
@@ -9,6 +12,11 @@
     from mock import patch
 
 
+@pytest.fixture
+def df():
+    return pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+
+
 class NoTransformT(object):
     """Transformer without transform method.
     """
@@ -98,3 +106,26 @@ def test_raises_type_error(mock_fit):
     """
     with pytest.raises(TypeError):
         _call_fit(Trans().fit, 'X', 'y', kwarg='kwarg')
+
+
+def test_make_feature_union_repeated_names(df):
+    mapping = [
+        ('a', None),
+        ('a', None)
+    ]
+    with pytest.raises(ValueError):
+        make_feature_union(mapping)
+
+
+def test_make_feature_union_repeated_names_custom_name(df):
+    mapping = [
+        ('a', None),
+        ('a', None, 'a_dupe')
+    ]
+    # should not raise: the custom name disambiguates the repeated column
+    make_feature_union(mapping)
+
+
+def test_get_feature_names():
+    mapping = [(['a'], None), ('b', None, 'beta')]
+    names = get_feature_names(mapping)
+    assert names == ["['a']", 'beta']
+ """ + cst = ColumnSelectTransformer(['z']) + with pytest.raises(ValueError): + selected = cst.fit_transform(df) + + +def test_passthrough_transformer(df): + pt = PassThroughTransformer() + result = pt.fit_transform(np.array([1, 2])) + assert (result == np.array([[1], [2]])).all() + + +def test_handle_feature(): + feature = np.array([1, 2, 3]) + assert handle_feature(feature).shape == (3, 1) + + feature = np.array([[1], [2], [3]]) + assert handle_feature(feature).shape == (3, 1) \ No newline at end of file