Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: better error-handling for df.set_index #22486

Merged
merged 16 commits into from
Oct 19, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,8 @@ Other API Changes
- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types,
has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
- Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)

Expand Down
55 changes: 38 additions & 17 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
is_sequence,
is_named_tuple)
from pandas.core.dtypes.concat import _get_sliced_frame_result_type
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex
from pandas.core.dtypes.missing import isna, notna

from pandas.core import algorithms
Expand Down Expand Up @@ -3980,6 +3981,25 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
if not isinstance(keys, list):
keys = [keys]

missing = []
for col in keys:
if (is_scalar(col) or isinstance(col, tuple)) and col in self:
# tuples can be both column keys or list-likes
# if they are valid column keys, everything is fine
continue
elif is_scalar(col) and col not in self:
# tuples that are not column keys are considered list-like,
# not considered missing
missing.append(col)
elif (not is_list_like(col, allow_sets=False)
or getattr(col, 'ndim', 1) > 1):
raise TypeError('The parameter "keys" may only contain a '
'combination of valid column keys and '
'one-dimensional list-likes')

if missing:
raise KeyError('{}'.format(missing))

if inplace:
frame = self
else:
Expand All @@ -3989,37 +4009,37 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
names = []
if append:
names = [x for x in self.index.names]
if isinstance(self.index, MultiIndex):
if isinstance(self.index, ABCMultiIndex):
for i in range(self.index.nlevels):
arrays.append(self.index._get_level_values(i))
else:
arrays.append(self.index)

to_remove = []
for col in keys:
if isinstance(col, MultiIndex):
# append all but the last column so we don't have to modify
# the end of this loop
for n in range(col.nlevels - 1):
if isinstance(col, ABCMultiIndex):
for n in range(col.nlevels):
arrays.append(col._get_level_values(n))

level = col._get_level_values(col.nlevels - 1)
names.extend(col.names)
elif isinstance(col, Series):
level = col._values
names.append(col.name)
elif isinstance(col, Index):
level = col
elif isinstance(col, (ABCIndexClass, ABCSeries)):
# if Index then not MultiIndex (treated above)
arrays.append(col)
names.append(col.name)
elif isinstance(col, (list, np.ndarray, Index)):
level = col
elif isinstance(col, (list, np.ndarray)):
arrays.append(col)
names.append(None)
elif (is_list_like(col)
and not (isinstance(col, tuple) and col in self)):
# all other list-likes (but avoid valid column keys)
col = list(col) # ensure iterator do not get read twice etc.
arrays.append(col)
names.append(None)
# from here, col can only be a column label
else:
level = frame[col]._values
arrays.append(frame[col]._values)
names.append(col)
if drop:
to_remove.append(col)
arrays.append(level)

index = ensure_index_from_sequences(arrays, names)

Expand All @@ -4028,7 +4048,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
raise ValueError('Index has duplicate keys: {dup}'.format(
dup=duplicates))

for c in to_remove:
# use set to handle duplicate column names gracefully in case of drop
for c in set(to_remove):
del frame[c]

# clear up memory usage
Expand Down
7 changes: 4 additions & 3 deletions pandas/tests/frame/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,12 +211,13 @@ def frame_of_index_cols():
"""
Fixture for DataFrame of columns that can be used for indexing

Columns are ['A', 'B', 'C', 'D', 'E']; 'A' & 'B' contain duplicates (but
are jointly unique), the rest are unique.
Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.
"""
df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
'B': ['one', 'two', 'three', 'one', 'two'],
'C': ['a', 'b', 'c', 'd', 'e'],
'D': np.random.randn(5),
'E': np.random.randn(5)})
'E': np.random.randn(5),
('tuple', 'as', 'label'): np.random.randn(5)})
return df
87 changes: 57 additions & 30 deletions pandas/tests/frame/test_alter_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ def test_set_index_cast(self):
tm.assert_frame_equal(df, df2)

# A has duplicate values, C does not
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']])
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
('tuple', 'as', 'label')])
@pytest.mark.parametrize('inplace', [True, False])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_drop_inplace(self, frame_of_index_cols,
Expand All @@ -72,7 +73,8 @@ def test_set_index_drop_inplace(self, frame_of_index_cols,
tm.assert_frame_equal(result, expected)

# A has duplicate values, C does not
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']])
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
('tuple', 'as', 'label')])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_append(self, frame_of_index_cols, drop, keys):
df = frame_of_index_cols
Expand All @@ -88,7 +90,8 @@ def test_set_index_append(self, frame_of_index_cols, drop, keys):
tm.assert_frame_equal(result, expected)

# A has duplicate values, C does not
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']])
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
('tuple', 'as', 'label')])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_append_to_multiindex(self, frame_of_index_cols,
drop, keys):
Expand All @@ -114,8 +117,10 @@ def test_set_index_after_mutation(self):
tm.assert_frame_equal(result, expected)

# MultiIndex constructor does not work directly on Series -> lambda
# Add list-of-list constructor because list is ambiguous -> lambda
# also test index name if append=True (name is duplicate here for B)
@pytest.mark.parametrize('box', [Series, Index, np.array,
list, tuple, iter, lambda x: [list(x)],
lambda x: MultiIndex.from_arrays([x])])
@pytest.mark.parametrize('append, index_name', [(True, None),
(True, 'B'), (True, 'test'), (False, None)])
Expand All @@ -126,21 +131,29 @@ def test_set_index_pass_single_array(self, frame_of_index_cols,
df.index.name = index_name

key = box(df['B'])
# np.array and list "forget" the name of B
name = [None if box in [np.array, list] else 'B']
if box == list:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why exactly is a list interpreted differently here? this makes this test really really odd. I am worried something changed here, as this appears to work just fine in the master.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You will see above (line 123) that list wasn't tested - for precisely this reason. df.set_index(['A', 'B']) interprets a A & B as column keys, so (assuming this df was length 2) it would not use the list ['A', 'B'] as the index. To do that, one would have to pass [['A', 'B']]. This PR proposes to add tests for the current behaviour.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback
In case the above was not very clearly worded, this corresponds exactly to behaviour on master:

>>> df = pd.DataFrame(np.random.randn(2,3))
>>> df.set_index(['A', 'B'])
KeyError: 'A'
>>> df.set_index([['A', 'B']])
          0         1         2
A  1.962370 -1.150770  0.843600
B -0.417274  0.509781 -0.697802

# list of strings gets interpreted as list of keys
msg = "['one', 'two', 'three', 'one', 'two']"
with tm.assert_raises_regex(KeyError, msg):
df.set_index(key, drop=drop, append=append)
else:
# np.array/tuple/iter/list-of-list "forget" the name of B
name_mi = getattr(key, 'names', None)
name = [getattr(key, 'name', None)] if name_mi is None else name_mi

result = df.set_index(key, drop=drop, append=append)
result = df.set_index(key, drop=drop, append=append)

# only valid column keys are dropped
# since B is always passed as array above, nothing is dropped
expected = df.set_index(['B'], drop=False, append=append)
expected.index.names = [index_name] + name if append else name
# only valid column keys are dropped
# since B is always passed as array above, nothing is dropped
expected = df.set_index(['B'], drop=False, append=append)
expected.index.names = [index_name] + name if append else name

tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected)

# MultiIndex constructor does not work directly on Series -> lambda
# also test index name if append=True (name is duplicate here for A & B)
@pytest.mark.parametrize('box', [Series, Index, np.array, list,
@pytest.mark.parametrize('box', [Series, Index, np.array,
list, tuple, iter,
lambda x: MultiIndex.from_arrays([x])])
@pytest.mark.parametrize('append, index_name',
[(True, None), (True, 'A'), (True, 'B'),
Expand All @@ -152,8 +165,8 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
df.index.name = index_name

keys = ['A', box(df['B'])]
# np.array and list "forget" the name of B
names = ['A', None if box in [np.array, list] else 'B']
# np.array/list/tuple/iter "forget" the name of B
names = ['A', None if box in [np.array, list, tuple, iter] else 'B']

result = df.set_index(keys, drop=drop, append=append)

Expand All @@ -168,10 +181,12 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
# MultiIndex constructor does not work directly on Series -> lambda
# We also emulate a "constructor" for the label -> lambda
# also test index name if append=True (name is duplicate here for A)
@pytest.mark.parametrize('box2', [Series, Index, np.array, list,
@pytest.mark.parametrize('box2', [Series, Index, np.array,
list, tuple, iter,
lambda x: MultiIndex.from_arrays([x]),
lambda x: x.name])
@pytest.mark.parametrize('box1', [Series, Index, np.array, list,
@pytest.mark.parametrize('box1', [Series, Index, np.array,
list, tuple, iter,
lambda x: MultiIndex.from_arrays([x]),
lambda x: x.name])
@pytest.mark.parametrize('append, index_name', [(True, None),
Expand All @@ -183,21 +198,22 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
df.index.name = index_name

keys = [box1(df['A']), box2(df['A'])]
result = df.set_index(keys, drop=drop, append=append)

# == gives ambiguous Boolean for Series
if drop and keys[0] is 'A' and keys[1] is 'A':
with tm.assert_raises_regex(KeyError, '.*'):
df.set_index(keys, drop=drop, append=append)
else:
result = df.set_index(keys, drop=drop, append=append)
# if either box was iter, the content has been consumed; re-read it
keys = [box1(df['A']), box2(df['A'])]

# to test against already-tested behavior, we add sequentially,
# hence second append always True; must wrap in list, otherwise
# list-box will be illegal
expected = df.set_index([keys[0]], drop=drop, append=append)
expected = expected.set_index([keys[1]], drop=drop, append=True)
# need to adapt first drop for case that both keys are 'A' --
# cannot drop the same column twice;
# use "is" because == would give ambiguous Boolean error for containers
first_drop = False if (keys[0] is 'A' and keys[1] is 'A') else drop

tm.assert_frame_equal(result, expected)
# to test against already-tested behaviour, we add sequentially,
# hence second append always True; must wrap keys in list, otherwise
# box = list would be illegal
expected = df.set_index([keys[0]], drop=first_drop, append=append)
expected = expected.set_index([keys[1]], drop=drop, append=True)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize('append', [True, False])
@pytest.mark.parametrize('drop', [True, False])
Expand Down Expand Up @@ -229,13 +245,24 @@ def test_set_index_verify_integrity(self, frame_of_index_cols):
def test_set_index_raise(self, frame_of_index_cols, drop, append):
df = frame_of_index_cols

with tm.assert_raises_regex(KeyError, '.*'): # column names are A-E
with tm.assert_raises_regex(KeyError, "['foo', 'bar', 'baz']"):
# column names are A-E, as well as one tuple
df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append)

# non-existent key in list with arrays
with tm.assert_raises_regex(KeyError, '.*'):
with tm.assert_raises_regex(KeyError, 'X'):
df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append)

msg = 'The parameter "keys" may only contain a combination of.*'
# forbidden type, e.g. set
with tm.assert_raises_regex(TypeError, msg):
df.set_index(set(df['A']), drop=drop, append=append)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are you singling out an iterable (set) is there some reason?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, just any of the types that are not allowed (I've tried to indicate as much by using "e.g.")


# forbidden type in list, e.g. set
with tm.assert_raises_regex(TypeError, msg):
df.set_index(['A', df['A'], set(df['A'])],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see my comment above, I am not sure a set is specifically excluded.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am proposing to exclude sets (see above). Irrespective of that, I need to test raising the TypeError for forbidden types (which type exactly is less relevant)

drop=drop, append=append)

def test_construction_with_categorical_index(self):
ci = tm.makeCategoricalIndex(10)
ci.name = 'B'
Expand Down