diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index f180443a508b13..1091df54d75ccd 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -546,6 +546,7 @@ Other API Changes - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) +- :meth:`DataFrame.set_index` now raises a ``TypeError`` for incorrect types, has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) .. _whatsnew_0240.deprecations: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3af753b05ed58a..db8aa4fc640b5c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -61,7 +61,7 @@ is_named_tuple) from pandas.core.dtypes.concat import _get_sliced_frame_result_type from pandas.core.dtypes.missing import isna, notna - +from pandas.core.dtypes.generic import ABCIndexClass, ABCMultiIndex, ABCSeries from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import (Index, MultiIndex, ensure_index, @@ -3898,6 +3898,22 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if not isinstance(keys, list): keys = [keys] + missing = [] + for x in keys: + if not (is_scalar(x) or isinstance(x, tuple)): + if not isinstance(x, (ABCSeries, ABCIndexClass, ABCMultiIndex, + list, np.ndarray)): + raise TypeError('keys may only contain a combination of ' + 'the following: valid column keys, ' + 'Series, Index, MultiIndex, list or ' + 'np.ndarray') + else: + if x not in self: + missing.append(x) + + if missing: + raise 
KeyError('{}'.format(missing)) + vi = verify_integrity return super(DataFrame, self).set_index(keys=keys, drop=drop, append=append, inplace=inplace, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a47d356ce9e0eb..9dbb464275ce04 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -786,7 +786,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False, raise ValueError('Index has duplicate keys: {dup}'.format( dup=duplicates)) - for c in to_remove: + # use set to handle duplicate column names gracefully in case of drop + for c in set(to_remove): del obj[c] # clear up memory usage diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 4e61c9c62266df..8c635b50a7c252 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -186,18 +186,19 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, # == gives ambiguous Boolean for Series if drop and keys[0] is 'A' and keys[1] is 'A': - with tm.assert_raises_regex(KeyError, '.*'): - df.set_index(keys, drop=drop, append=append) + # can't drop same column twice + first_drop = False else: - result = df.set_index(keys, drop=drop, append=append) + first_drop = drop - # to test against already-tested behavior, we add sequentially, - # hence second append always True; must wrap in list, otherwise - # list-box will be illegal - expected = df.set_index([keys[0]], drop=drop, append=append) - expected = expected.set_index([keys[1]], drop=drop, append=True) + # to test against already-tested behaviour, we add sequentially, + # hence second append always True; must wrap in list, otherwise + # list-box will be illegal + expected = df.set_index([keys[0]], drop=first_drop, append=append) + expected = expected.set_index([keys[1]], drop=drop, append=True) - tm.assert_frame_equal(result, expected) + result = df.set_index(keys, drop=drop, append=append) + tm.assert_frame_equal(result, expected) 
@pytest.mark.parametrize('append', [True, False]) @pytest.mark.parametrize('drop', [True, False]) @@ -229,13 +230,24 @@ def test_set_index_verify_integrity(self, frame_of_index_cols): def test_set_index_raise(self, frame_of_index_cols, drop, append): df = frame_of_index_cols - with tm.assert_raises_regex(KeyError, '.*'): # column names are A-E + with tm.assert_raises_regex(KeyError, r"\['foo', 'bar', 'baz'\]"): + # column names are A-E; brackets escaped to match the list literally df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append) # non-existent key in list with arrays - with tm.assert_raises_regex(KeyError, '.*'): + with tm.assert_raises_regex(KeyError, 'X'): df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append) + rgx = 'keys may only contain a combination of the following:.*' + # forbidden type, e.g. set + with tm.assert_raises_regex(TypeError, rgx): + df.set_index(set(df['A']), drop=drop, append=append) + + # forbidden type in list, e.g. set + with tm.assert_raises_regex(TypeError, rgx): + df.set_index(['A', df['A'], set(df['A'])], + drop=drop, append=append) + def test_construction_with_categorical_index(self): ci = tm.makeCategoricalIndex(10) ci.name = 'B'