Skip to content

Commit

Permalink
Rebased version of pandas-dev#22486
Browse files Browse the repository at this point in the history
  • Loading branch information
h-vetinari committed Oct 16, 2018
1 parent 99bae05 commit 8ab863b
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 37 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,8 @@ Other API Changes
- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
- :meth:`DataFrame.set_index` now raises a ``TypeError`` for incorrect types, has an improved ``KeyError`` message,
and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
- Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)

Expand Down
24 changes: 22 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
from pandas.core.accessor import CachedAccessor
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.config import get_option

from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.index import (Index, MultiIndex, ensure_index,
ensure_index_from_sequences)
Expand Down Expand Up @@ -3963,7 +3964,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
7 2013 84
10 2014 31
Create a multi-index using columns 'year' and 'month':
Create a MultiIndex using columns 'year' and 'month':
>>> df.set_index(['year', 'month'])
sale
Expand All @@ -3973,7 +3974,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
2013 7 84
2014 10 31
Create a multi-index using a set of values and a column:
Create a MultiIndex using a set of values and a column:
>>> df.set_index([[1, 2, 3, 4], 'year'])
month sale
Expand All @@ -3986,6 +3987,25 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
if not isinstance(keys, list):
keys = [keys]

missing = []
for col in keys:
if (is_scalar(col) or isinstance(col, tuple)) and col in self:
# tuples can be either column keys or list-likes;
# if they are valid column keys, everything is fine
continue
elif is_scalar(col) and col not in self:
# only scalars can be missing column keys; tuples that are not
# column keys are considered list-like, not considered missing
missing.append(col)
elif (not is_list_like(col) or isinstance(col, set)
or getattr(col, 'ndim', 1) > 1):
raise TypeError('The parameter "keys" may only contain a '
'combination of valid column keys and '
'one-dimensional list-likes')

if missing:
raise KeyError('{}'.format(missing))

vi = verify_integrity
return super(DataFrame, self).set_index(keys=keys, drop=drop,
append=append, inplace=inplace,
Expand Down
23 changes: 13 additions & 10 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
7 2013 84
10 2014 31
Create a multi-index using columns 'year' and 'month':
Create a MultiIndex using columns 'year' and 'month':
>>> df.set_index(['year', 'month'])
sale
Expand All @@ -709,7 +709,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
2013 7 84
2014 10 31
Create a multi-index using a set of values and a column:
Create a MultiIndex using a set of values and a column:
>>> df.set_index([[1, 2, 3, 4], 'year'])
month sale
Expand Down Expand Up @@ -741,18 +741,20 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
for n in range(col.nlevels):
arrays.append(col._get_level_values(n))
names.extend(col.names)
elif isinstance(col, ABCIndexClass):
# Index but not MultiIndex (treated above)
elif isinstance(col, (ABCIndexClass, ABCSeries)):
# if Index then not MultiIndex (treated above)
arrays.append(col)
names.append(col.name)
elif isinstance(col, ABCSeries):
arrays.append(col._values)
names.append(col.name)
elif isinstance(col, (list, np.ndarray)):
arrays.append(col)
names.append(None)
# from here, col can only be a column label (and obj a DataFrame);
# see checks in Series.set_index and DataFrame.set_index
elif (is_list_like(col)
and not (isinstance(col, tuple) and col in self)):
# all other list-likes (but avoid valid column keys)
col = list(col) # ensure iterators do not get read twice etc.
arrays.append(col)
names.append(None)
# from here, col can only be a column label
else:
arrays.append(obj[col]._values)
names.append(col)
Expand All @@ -766,7 +768,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
raise ValueError('Index has duplicate keys: {dup}'.format(
dup=duplicates))

for c in to_remove:
# use set to handle duplicate column names gracefully in case of drop
for c in set(to_remove):
del obj[c]

# clear up memory usage
Expand Down
14 changes: 7 additions & 7 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1136,15 +1136,15 @@ def set_index(self, arrays, append=False, inplace=False,
c 12
dtype: int64
Create a multi-index by appending to the existing index:
Create a MultiIndex by appending to the existing index:
>>> s.set_index(['a', 'b', 'c'], append=True)
0 a 10
1 b 11
2 c 12
dtype: int64
Create a multi-index by passing a list of arrays:
Create a MultiIndex by passing a list of arrays:
>>> t = (s ** 2).set_index([['a', 'b', 'c'], ['I', 'II', 'III']])
>>> t
Expand All @@ -1166,11 +1166,11 @@ def set_index(self, arrays, append=False, inplace=False,
elif all(is_scalar(x) for x in arrays):
arrays = [arrays]

if any(not isinstance(x, (ABCSeries, ABCIndexClass, list, np.ndarray))
for x in arrays):
raise TypeError('arrays must be Series, Index, MultiIndex, list, '
'np.ndarray or list containing only Series, '
'Index, MultiIndex, list, np.ndarray')
if any(not is_list_like(x) or isinstance(x, set)
or getattr(x, 'ndim', 1) > 1 for x in arrays):
raise TypeError('The parameter "arrays" may only contain a '
'combination of valid column keys and '
'one-dimensional list-likes')

return super(Series, self).set_index(keys=arrays, drop=False,
append=append, inplace=inplace,
Expand Down
7 changes: 4 additions & 3 deletions pandas/tests/frame/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,12 +211,13 @@ def frame_of_index_cols():
"""
Fixture for DataFrame of columns that can be used for indexing
Columns are ['A', 'B', 'C', 'D', 'E']; 'A' & 'B' contain duplicates (but
are jointly unique), the rest are unique.
Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.
"""
df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
'B': ['one', 'two', 'three', 'one', 'two'],
'C': ['a', 'b', 'c', 'd', 'e'],
'D': np.random.randn(5),
'E': np.random.randn(5)})
'E': np.random.randn(5),
('tuple', 'as', 'label'): np.random.randn(5)})
return df
34 changes: 23 additions & 11 deletions pandas/tests/frame/test_alter_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,18 +186,19 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,

# == gives ambiguous Boolean for Series
if drop and keys[0] is 'A' and keys[1] is 'A':
with tm.assert_raises_regex(KeyError, '.*'):
df.set_index(keys, drop=drop, append=append)
# can't drop same column twice
first_drop = False
else:
result = df.set_index(keys, drop=drop, append=append)
first_drop = drop

# to test against already-tested behavior, we add sequentially,
# hence second append always True; must wrap in list, otherwise
# list-box will be illegal
expected = df.set_index([keys[0]], drop=drop, append=append)
expected = expected.set_index([keys[1]], drop=drop, append=True)
# to test against already-tested behaviour, we add sequentially,
# hence second append always True; must wrap in list, otherwise
# list-box will be illegal
expected = df.set_index([keys[0]], drop=first_drop, append=append)
expected = expected.set_index([keys[1]], drop=drop, append=True)

tm.assert_frame_equal(result, expected)
result = df.set_index(keys, drop=drop, append=append)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize('append', [True, False])
@pytest.mark.parametrize('drop', [True, False])
Expand Down Expand Up @@ -229,13 +230,24 @@ def test_set_index_verify_integrity(self, frame_of_index_cols):
def test_set_index_raise(self, frame_of_index_cols, drop, append):
df = frame_of_index_cols

with tm.assert_raises_regex(KeyError, '.*'): # column names are A-E
with tm.assert_raises_regex(KeyError, "['foo', 'bar', 'baz']"):
# column names are A-E, as well as ('tuple', 'as', 'label')
df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append)

# non-existent key in list with arrays
with tm.assert_raises_regex(KeyError, '.*'):
with tm.assert_raises_regex(KeyError, 'X'):
df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append)

msg = 'The parameter "keys" may only contain a combination of.*'
# forbidden type, e.g. set
with tm.assert_raises_regex(TypeError, msg):
df.set_index(set(df['A']), drop=drop, append=append)

# forbidden type in list, e.g. set
with tm.assert_raises_regex(TypeError, msg):
df.set_index(['A', df['A'], set(df['A'])],
drop=drop, append=append)

def test_construction_with_categorical_index(self):
ci = tm.makeCategoricalIndex(10)
ci.name = 'B'
Expand Down
9 changes: 5 additions & 4 deletions pandas/tests/series/test_alter_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,14 @@ def test_set_index_verify_integrity(self, string_series):
string_series.set_index([idx, idx], verify_integrity=True)

def test_set_index_raise(self, string_series):
# wrong type: iterator
with tm.assert_raises_regex(TypeError, 'arrays must be.*'):
string_series.set_index(iter(string_series.index),
msg = 'The parameter "arrays" may only contain a combination.*'
# forbidden type, e.g. set
with tm.assert_raises_regex(TypeError, msg):
string_series.set_index(set(string_series.index),
verify_integrity=True)

# wrong type in list with arrays
with tm.assert_raises_regex(TypeError, 'arrays must be.*'):
with tm.assert_raises_regex(TypeError, msg):
string_series.set_index([string_series.index, 'X'],
verify_integrity=True)

Expand Down

0 comments on commit 8ab863b

Please sign in to comment.